diff --git a/Cargo.lock b/Cargo.lock
index 601af664ec..96262f9e77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3168,6 +3168,7 @@ version = "0.5.0"
 dependencies = [
  "bindgen 0.66.1",
  "cc",
+ "glob",
  "libsql-wasmtime-bindings",
 ]
 
diff --git a/libsql-ffi/Cargo.toml b/libsql-ffi/Cargo.toml
index 099bfed400..13aa90d23f 100644
--- a/libsql-ffi/Cargo.toml
+++ b/libsql-ffi/Cargo.toml
@@ -16,6 +16,7 @@ libsql-wasmtime-bindings = { version = "0.2.1", optional = true }
 [build-dependencies]
 bindgen = "0.66.1"
 cc = "1.0"
+glob = "0.3"
 
 [features]
 session = []
@@ -28,3 +29,4 @@ wasm32-wasi-vfs = []
 unlock_notify = []
 preupdate_hook = []
 sqlcipher = []
+sqlean-extensions = []
diff --git a/libsql-ffi/build.rs b/libsql-ffi/build.rs
index 55f25ec5cf..0575a06263 100644
--- a/libsql-ffi/build.rs
+++ b/libsql-ffi/build.rs
@@ -1,3 +1,4 @@
+use glob::glob;
 use std::env;
 use std::ffi::OsString;
 use std::fs::{self, OpenOptions};
@@ -146,8 +147,7 @@ pub fn build_bundled(out_dir: &str, out_path: &Path) {
     std::fs::copy(format!("{dir}/{bindgen_rs_path}"), out_path).unwrap();
 
     let mut cfg = cc::Build::new();
-    cfg.file(format!("{BUNDLED_DIR}/src/sqlite3.c"))
-        .flag("-std=c11")
+    cfg.flag("-std=c11")
         .flag("-DSQLITE_CORE")
         .flag("-DSQLITE_DEFAULT_FOREIGN_KEYS=1")
         .flag("-DSQLITE_ENABLE_API_ARMOR")
@@ -169,6 +169,37 @@ pub fn build_bundled(out_dir: &str, out_path: &Path) {
         .flag("-D_POSIX_THREAD_SAFE_FUNCTIONS") // cross compile with MinGW
         .warnings(false);
 
+    if cfg!(feature = "sqlean-extensions") {
+        cfg.flag("-DSQLITE_EXTRA_INIT=core_init");
+        let sqlean_patterns = [
+            "crypto/*.c",
+            "fuzzy/*.c",
+            "math/*.c",
+            "stats/*.c",
+            "text/*.c",
+            "text/*/*.c",
+            "uuid/*.c",
+        ];
+
+        let mut sqlean_sources = Vec::new();
+        for pattern in sqlean_patterns {
+            let full_pattern = format!("{BUNDLED_DIR}/sqlean/{}", pattern);
+            let paths = glob(&full_pattern).expect("invalid sqlean glob pattern");
+            sqlean_sources.extend(paths.map(|p| p.expect("unreadable sqlean source path")));
+        }
+
+        cfg.files(sqlean_sources);
+
+        let sqlean_entry = format!("{BUNDLED_DIR}/src/sqlite3-sqlean-stripped.c");
+        let sqlean = format!("{BUNDLED_DIR}/src/sqlean.c");
+        cfg.file(sqlean_entry);
+        cfg.file(sqlean);
+
+        cfg.include(format!("{BUNDLED_DIR}/sqlean/"));
+    } else {
+        cfg.file(format!("{BUNDLED_DIR}/src/sqlite3.c"));
+    }
+
     if cfg!(feature = "wasmtime-bindings") {
         cfg.flag("-DLIBSQL_ENABLE_WASM_RUNTIME=1");
     }
diff --git a/libsql-ffi/bundled/sqlean/crypto/base32.c b/libsql-ffi/bundled/sqlean/crypto/base32.c
new file mode 100644
index 0000000000..9381b00584
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base32.c
@@ -0,0 +1,83 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Base32 encoding/decoding (RFC 4648)
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const char base32_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567";
+
+uint8_t* base32_encode(const uint8_t* src, size_t len, size_t* out_len) {
+    *out_len = ((len + 4) / 5) * 8;
+    uint8_t* encoded = malloc(*out_len + 1);
+    if (encoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    for (size_t i = 0, j = 0; i < len;) {
+        uint32_t octet0 = i < len ? src[i++] : 0;
+        uint32_t octet1 = i < len ? src[i++] : 0;
+        uint32_t octet2 = i < len ? src[i++] : 0;
+        uint32_t octet3 = i < len ? src[i++] : 0;
+        uint32_t octet4 = i < len ? src[i++] : 0;
+
+        encoded[j++] = base32_chars[octet0 >> 3];
+        encoded[j++] = base32_chars[((octet0 & 0x07) << 2) | (octet1 >> 6)];
+        encoded[j++] = base32_chars[(octet1 >> 1) & 0x1F];
+        encoded[j++] = base32_chars[((octet1 & 0x01) << 4) | (octet2 >> 4)];
+        encoded[j++] = base32_chars[((octet2 & 0x0F) << 1) | (octet3 >> 7)];
+        encoded[j++] = base32_chars[(octet3 >> 2) & 0x1F];
+        encoded[j++] = base32_chars[((octet3 & 0x03) << 3) | (octet4 >> 5)];
+        encoded[j++] = base32_chars[octet4 & 0x1F];
+    }
+
+    if (len % 5 != 0) {
+        size_t padding = 7 - (len % 5) * 8 / 5;
+        for (size_t i = 0; i < padding; i++) {
+            encoded[*out_len - padding + i] = '=';
+        }
+    }
+
+    encoded[*out_len] = '\0';
+    return encoded;
+}
+
+uint8_t* base32_decode(const uint8_t* src, size_t len, size_t* out_len) {
+    while (len > 0 && src[len - 1] == '=') {
+        len--;
+    }
+    *out_len = len * 5 / 8;
+    // +1 so that a zero-length result never turns into malloc(0)
+    uint8_t* decoded = malloc(*out_len + 1);
+    if (decoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    size_t bits = 0, value = 0, count = 0;
+    for (size_t i = 0; i < len; i++) {
+        uint8_t c = src[i];
+        if (c >= 'A' && c <= 'Z') {
+            c -= 'A';
+        } else if (c >= '2' && c <= '7') {
+            c -= '2' - 26;
+        } else {
+            continue;
+        }
+        value = (value << 5) | c;
+        bits += 5;
+        if (bits >= 8) {
+            decoded[count++] = (uint8_t)(value >> (bits - 8));
+            bits -= 8;
+        }
+    }
+    if (bits >= 5 || (value & ((1 << bits) - 1)) != 0) {
+        free(decoded);
+        return NULL;
+    }
+    *out_len = count;
+    return decoded;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/base32.h b/libsql-ffi/bundled/sqlean/crypto/base32.h
new file mode 100644
index 0000000000..4221ea22d6
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base32.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Base32 encoding/decoding (RFC 4648)
+
+#ifndef _BASE32_H_
+#define _BASE32_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint8_t* base32_encode(const uint8_t* src, size_t len, size_t* out_len);
+uint8_t* base32_decode(const uint8_t* src, size_t len, size_t* out_len);
+
+#endif /* _BASE32_H_ */
diff --git a/libsql-ffi/bundled/sqlean/crypto/base64.c b/libsql-ffi/bundled/sqlean/crypto/base64.c
new file mode 100644
index 0000000000..dccc599136
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base64.c
@@ -0,0 +1,113 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Base64 encoding/decoding (RFC 4648)
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const char base64_chars[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+uint8_t* base64_encode(const uint8_t* src, size_t len, size_t* out_len) {
+    uint8_t* encoded = NULL;
+    size_t i, j;
+    uint32_t octets;
+
+    *out_len = ((len + 2) / 3) * 4;
+    encoded = malloc(*out_len + 1);
+    if (encoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    for (i = 0, j = 0; i < len; i += 3, j += 4) {
+        octets =
+            (src[i] << 16) | ((i + 1 < len ? src[i + 1] : 0) << 8) | (i + 2 < len ? src[i + 2] : 0);
+        encoded[j] = base64_chars[(octets >> 18) & 0x3f];
+        encoded[j + 1] = base64_chars[(octets >> 12) & 0x3f];
+        encoded[j + 2] = base64_chars[(octets >> 6) & 0x3f];
+        encoded[j + 3] = base64_chars[octets & 0x3f];
+    }
+
+    if (len % 3 == 1) {
+        encoded[*out_len - 1] = '=';
+        encoded[*out_len - 2] = '=';
+    } else if (len % 3 == 2) {
+        encoded[*out_len - 1] = '=';
+    }
+
+    encoded[*out_len] = '\0';
+    return encoded;
+}
+
+static const uint8_t base64_table[256] = {
+    // Map base64 characters to their corresponding values
+    // (256 entries: any byte is a valid index, unlisted ones map to 0)
+    ['A'] = 0,  ['B'] = 1,  ['C'] = 2,  ['D'] = 3,  ['E'] = 4,  ['F'] = 5,  ['G'] = 6,  ['H'] = 7,
+    ['I'] = 8,  ['J'] = 9,  ['K'] = 10, ['L'] = 11, ['M'] = 12, ['N'] = 13, ['O'] = 14, ['P'] = 15,
+    ['Q'] = 16, ['R'] = 17, ['S'] = 18, ['T'] = 19, ['U'] = 20, ['V'] = 21, ['W'] = 22, ['X'] = 23,
+    ['Y'] = 24, ['Z'] = 25, ['a'] = 26, ['b'] = 27, ['c'] = 28, ['d'] = 29, ['e'] = 30, ['f'] = 31,
+    ['g'] = 32, ['h'] = 33, ['i'] = 34, ['j'] = 35, ['k'] = 36, ['l'] = 37, ['m'] = 38, ['n'] = 39,
+    ['o'] = 40, ['p'] = 41, ['q'] = 42, ['r'] = 43, ['s'] = 44, ['t'] = 45, ['u'] = 46, ['v'] = 47,
+    ['w'] = 48, ['x'] = 49, ['y'] = 50, ['z'] = 51, ['0'] = 52, ['1'] = 53, ['2'] = 54, ['3'] = 55,
+    ['4'] = 56, ['5'] = 57, ['6'] = 58, ['7'] = 59, ['8'] = 60, ['9'] = 61, ['+'] = 62, ['/'] = 63,
+};
+
+// Decodes a base64 string. Returns a malloc'ed buffer and stores its length in
+// *out_len, or returns NULL (with *out_len = 0) if the input is malformed.
+uint8_t* base64_decode(const uint8_t* src, size_t len, size_t* out_len) {
+    *out_len = 0;
+    if (len % 4 != 0) {
+        return NULL;
+    }
+
+    // Count the trailing '=' (at most two); '=' anywhere else is malformed.
+    size_t padding = 0;
+    if (len > 0 && src[len - 1] == '=') {
+        padding++;
+        if (src[len - 2] == '=') {
+            padding++;
+        }
+    }
+
+    // +1 so that an empty result never turns into malloc(0)
+    size_t decoded_len = (len / 4) * 3 - padding;
+    uint8_t* decoded = malloc(decoded_len + 1);
+    if (decoded == NULL) {
+        return NULL;
+    }
+
+    for (size_t i = 0, j = 0; i < len; i += 4, j += 3) {
+        uint32_t block = 0;
+        for (size_t k = 0; k < 4; k++) {
+            uint8_t ch = src[i + k];
+            block <<= 6;
+            if (ch == '=') {
+                if (i + k < len - padding) {
+                    free(decoded);
+                    return NULL;
+                }
+            } else {
+                uint8_t index = base64_table[ch];
+                if (index == 0 && ch != 'A') {
+                    free(decoded);
+                    return NULL;
+                }
+                block |= index;
+            }
+        }
+
+        decoded[j] = (block >> 16) & 0xFF;
+        if (j + 1 < decoded_len) {
+            decoded[j + 1] = (block >> 8) & 0xFF;
+        }
+        if (j + 2 < decoded_len) {
+            decoded[j + 2] = block & 0xFF;
+        }
+    }
+
+    *out_len = decoded_len;
+    return decoded;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/base64.h b/libsql-ffi/bundled/sqlean/crypto/base64.h
new file mode 100644
index 0000000000..cc79560508
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base64.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Base64 encoding/decoding (RFC 4648)
+
+#ifndef BASE64_H
+#define BASE64_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint8_t* base64_encode(const uint8_t* src, size_t len, size_t* out_len);
+uint8_t* base64_decode(const uint8_t* src, size_t len, size_t* out_len);
+
+#endif /* BASE64_H */
diff --git a/libsql-ffi/bundled/sqlean/crypto/base85.c b/libsql-ffi/bundled/sqlean/crypto/base85.c
new file mode 100644
index 0000000000..4bfac0f199
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base85.c
@@ -0,0 +1,131 @@
+// Originally by Fränz Friederes, MIT License
+// https://github.com/cryptii/cryptii/blob/main/src/Encoder/Ascii85.js
+
+// Modified by Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean/
+
+// Base85 (Ascii85) encoding/decoding
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Encodes len bytes as Ascii85 text. Returns a malloc'ed, NUL-terminated
+// string and stores its length in *out_len, or returns NULL on allocation failure.
+uint8_t* base85_encode(const uint8_t* src, size_t len, size_t* out_len) {
+    uint8_t* encoded = malloc(len * 5 / 4 + 5);
+    if (encoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    // Encode each tuple of 4 bytes
+    uint32_t digits[5], tuple;
+    size_t pos = 0;
+    for (size_t i = 0; i < len; i += 4) {
+        // Number of real input bytes in this tuple; the last one may be short
+        size_t avail = len - i < 4 ? len - i : 4;
+
+        // Read 32-bit unsigned integer from bytes following the
+        // big-endian convention (most significant byte first),
+        // treating the bytes missing from a short tuple as zero
+        tuple = 0;
+        for (size_t j = 0; j < 4; j++) {
+            tuple = (tuple << 8) | (j < avail ? src[i + j] : 0u);
+        }
+
+        if (tuple == 0 && avail == 4) {
+            // A complete all-zero tuple is encoded as a single character
+            encoded[pos++] = 'z';
+            continue;
+        }
+
+        // Calculate 5 digits by repeatedly dividing
+        // by 85 and taking the remainder
+        for (size_t j = 0; j < 5; j++) {
+            digits[4 - j] = tuple % 85;
+            tuple = tuple / 85;
+        }
+
+        // Omit final characters added due to bytes of padding
+        for (size_t j = 0; j < avail + 1; j++) {
+            encoded[pos++] = digits[j] + 33;
+        }
+    }
+
+    // 'z' shortens the output, so the length is whatever was actually written
+    *out_len = pos;
+    encoded[pos] = '\0';
+    return encoded;
+}
+
+// Decodes Ascii85 text. Returns a malloc'ed buffer and stores its length in
+// *out_len, or returns NULL (with *out_len = 0) if the input is malformed.
+uint8_t* base85_decode(const uint8_t* src, size_t len, size_t* out_len) {
+    *out_len = 0;
+    if (len > SIZE_MAX / 8) {
+        return NULL;
+    }
+
+    // Every 'z' expands to 4 bytes, so the buffer has to be sized from the
+    // actual input rather than from the usual 5 characters -> 4 bytes ratio
+    size_t zeros = 0;
+    for (size_t i = 0; i < len; i++) {
+        if (src[i] == 'z') {
+            zeros++;
+        }
+    }
+    uint8_t* decoded = malloc(zeros * 4 + (len - zeros) / 5 * 4 + 4);
+    if (decoded == NULL) {
+        return NULL;
+    }
+
+    size_t pos = 0;
+    for (size_t i = 0; i < len;) {
+        if (src[i] == 'z') {
+            // A single character encodes an all-zero tuple
+            memset(decoded + pos, 0, 4);
+            pos += 4;
+            i++;
+            continue;
+        }
+
+        // Characters available for this tuple; a lone trailing character
+        // cannot encode even a single byte
+        size_t avail = len - i < 5 ? len - i : 5;
+        if (avail < 2) {
+            free(decoded);
+            return NULL;
+        }
+
+        // Create 32-bit binary number from the radix-85 digits,
+        // tuple = a * 85^4 + b * 85^3 + c * 85^2 + d * 85 + e,
+        // padding a short tuple with 'u' (the highest digit)
+        uint64_t tuple = 0;
+        for (size_t k = 0; k < 5; k++) {
+            uint32_t digit = 84;
+            if (k < avail) {
+                if (src[i + k] < '!' || src[i + k] > 'u') {
+                    free(decoded);
+                    return NULL;
+                }
+                digit = src[i + k] - '!';
+            }
+            tuple = tuple * 85 + digit;
+        }
+        if (tuple > UINT32_MAX) {
+            free(decoded);
+            return NULL;
+        }
+
+        // Append bytes to result, dropping the bytes of padding
+        for (size_t k = 0; k + 1 < avail; k++) {
+            decoded[pos++] = (uint8_t)(tuple >> (24 - 8 * k));
+        }
+        i += avail;
+    }
+
+    *out_len = pos;
+    return decoded;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/base85.h b/libsql-ffi/bundled/sqlean/crypto/base85.h
new file mode 100644
index 0000000000..554473aaf7
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/base85.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Base85 (Ascii85) encoding/decoding
+
+#ifndef _BASE85_H_
+#define _BASE85_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint8_t* base85_encode(const uint8_t* src, size_t len, size_t* out_len);
+uint8_t* base85_decode(const uint8_t* src, size_t len, size_t* out_len);
+
+#endif /* _BASE85_H_ */
diff --git a/libsql-ffi/bundled/sqlean/crypto/blake3.c b/libsql-ffi/bundled/sqlean/crypto/blake3.c
new file mode 100644
index 0000000000..044c307c0e
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/blake3.c
@@ -0,0 +1,25 @@
+// Created by: Peter Tripp (@notpeter)
+// Public Domain
+
+#include <stddef.h>
+#include <stdlib.h>
+#include "crypto/blake3.h"
+
+void* blake3_init(void) {
+    blake3_hasher* context;
+    context = malloc(sizeof(blake3_hasher));
+    if (!context)
+        return NULL;
+    blake3_hasher_init(context);
+    return context;
+}
+
+void blake3_update(blake3_hasher* ctx, const unsigned char* data, size_t len) {
+    blake3_hasher_update(ctx, data, len);
+}
+
+int blake3_final(blake3_hasher* ctx, unsigned char hash[]) {
+    blake3_hasher_finalize(ctx, hash, BLAKE3_OUT_LEN);
+    free(ctx);
+    return BLAKE3_OUT_LEN;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/blake3.h b/libsql-ffi/bundled/sqlean/crypto/blake3.h
new file mode 100644
index 0000000000..4d9c1e418b
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/blake3.h
@@ -0,0 +1,13 @@
+// Created by: Peter Tripp (@notpeter)
+// Public Domain
+
+#ifndef __BLAKE3_H__
+#define __BLAKE3_H__
+
+#include "crypto/blake3_reference_impl.h"
+
+void* blake3_init(void);
+void blake3_update(blake3_hasher* ctx, const unsigned char data[], size_t len);
+int blake3_final(blake3_hasher* ctx, unsigned char hash[]);
+
+#endif
diff --git a/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.c b/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.c
new file mode 100644
index 0000000000..cd809338d1
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.c
@@ -0,0 +1,368 @@
+// Originally from blake3 reference implementation, Public Domain
+// https://github.com/oconnor663/blake3_reference_impl_c
+
+#include <assert.h>
+#include <string.h>
+
+#include "crypto/blake3_reference_impl.h"
+
+#define CHUNK_START (1 << 0)
+#define CHUNK_END (1 << 1)
+#define PARENT (1 << 2)
+#define ROOT (1 << 3)
+#define KEYED_HASH (1 << 4)
+#define DERIVE_KEY_CONTEXT (1 << 5)
+#define DERIVE_KEY_MATERIAL (1 << 6)
+
+static uint32_t IV[8] = {
+    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
+};
+
+static size_t MSG_PERMUTATION[16] = {2, 6,  3,  10, 7, 0,  4,  13,
+                                     1, 11, 12, 5,  9, 14, 15, 8};
+
+inline static uint32_t rotate_right(uint32_t x, int n) {
+  return (x >> n) | (x << (32 - n));
+}
+
+// The mixing function, G, which mixes either a column or a diagonal.
+inline static void g(uint32_t state[16], size_t a, size_t b, size_t c, size_t d,
+                     uint32_t mx, uint32_t my) {
+  state[a] = state[a] + state[b] + mx;
+  state[d] = rotate_right(state[d] ^ state[a], 16);
+  state[c] = state[c] + state[d];
+  state[b] = rotate_right(state[b] ^ state[c], 12);
+  state[a] = state[a] + state[b] + my;
+  state[d] = rotate_right(state[d] ^ state[a], 8);
+  state[c] = state[c] + state[d];
+  state[b] = rotate_right(state[b] ^ state[c], 7);
+}
+
+inline static void round_function(uint32_t state[16], uint32_t m[16]) {
+  // Mix the columns.
+  g(state, 0, 4, 8, 12, m[0], m[1]);
+  g(state, 1, 5, 9, 13, m[2], m[3]);
+  g(state, 2, 6, 10, 14, m[4], m[5]);
+  g(state, 3, 7, 11, 15, m[6], m[7]);
+  // Mix the diagonals.
+  g(state, 0, 5, 10, 15, m[8], m[9]);
+  g(state, 1, 6, 11, 12, m[10], m[11]);
+  g(state, 2, 7, 8, 13, m[12], m[13]);
+  g(state, 3, 4, 9, 14, m[14], m[15]);
+}
+
+inline static void permute(uint32_t m[16]) {
+  uint32_t permuted[16];
+  for (size_t i = 0; i < 16; i++) {
+    permuted[i] = m[MSG_PERMUTATION[i]];
+  }
+  memcpy(m, permuted, sizeof(permuted));
+}
+
+inline static void compress(const uint32_t chaining_value[8],
+                            const uint32_t block_words[16], uint64_t counter,
+                            uint32_t block_len, uint32_t flags,
+                            uint32_t out[16]) {
+  uint32_t state[16] = {
+      chaining_value[0],
+      chaining_value[1],
+      chaining_value[2],
+      chaining_value[3],
+      chaining_value[4],
+      chaining_value[5],
+      chaining_value[6],
+      chaining_value[7],
+      IV[0],
+      IV[1],
+      IV[2],
+      IV[3],
+      (uint32_t)counter,
+      (uint32_t)(counter >> 32),
+      block_len,
+      flags,
+  };
+  uint32_t block[16];
+  memcpy(block, block_words, sizeof(block));
+
+  round_function(state, block);  // round 1
+  permute(block);
+  round_function(state, block);  // round 2
+  permute(block);
+  round_function(state, block);  // round 3
+  permute(block);
+  round_function(state, block);  // round 4
+  permute(block);
+  round_function(state, block);  // round 5
+  permute(block);
+  round_function(state, block);  // round 6
+  permute(block);
+  round_function(state, block);  // round 7
+
+  for (size_t i = 0; i < 8; i++) {
+    state[i] ^= state[i + 8];
+    state[i + 8] ^= chaining_value[i];
+  }
+
+  memcpy(out, state, sizeof(state));
+}
+
+inline static void words_from_little_endian_bytes(const void *bytes,
+                                                  size_t bytes_len,
+                                                  uint32_t *out) {
+  assert(bytes_len % 4 == 0);
+  const uint8_t *u8_ptr = (const uint8_t *)bytes;
+  for (size_t i = 0; i < (bytes_len / 4); i++) {
+    out[i] = ((uint32_t)(*u8_ptr++));
+    out[i] += ((uint32_t)(*u8_ptr++)) << 8;
+    out[i] += ((uint32_t)(*u8_ptr++)) << 16;
+    out[i] += ((uint32_t)(*u8_ptr++)) << 24;
+  }
+}
+
+// Each chunk or parent node can produce either an 8-word chaining value or, by
+// setting the ROOT flag, any number of final output bytes. The Output struct
+// captures the state just prior to choosing between those two possibilities.
+typedef struct output {
+  uint32_t input_chaining_value[8];
+  uint32_t block_words[16];
+  uint64_t counter;
+  uint32_t block_len;
+  uint32_t flags;
+} output;
+
+inline static void output_chaining_value(const output *self, uint32_t out[8]) {
+  uint32_t out16[16];
+  compress(self->input_chaining_value, self->block_words, self->counter,
+           self->block_len, self->flags, out16);
+  memcpy(out, out16, 8 * 4);
+}
+
+inline static void output_root_bytes(const output *self, void *out,
+                                     size_t out_len) {
+  uint8_t *out_u8 = (uint8_t *)out;
+  uint64_t output_block_counter = 0;
+  while (out_len > 0) {
+    uint32_t words[16];
+    compress(self->input_chaining_value, self->block_words,
+             output_block_counter, self->block_len, self->flags | ROOT, words);
+    for (size_t word = 0; word < 16; word++) {
+      for (int byte = 0; byte < 4; byte++) {
+        if (out_len == 0) {
+          return;
+        }
+        *out_u8 = (uint8_t)(words[word] >> (8 * byte));
+        out_u8++;
+        out_len--;
+      }
+    }
+    output_block_counter++;
+  }
+}
+
+inline static void chunk_state_init(_blake3_chunk_state *self,
+                                    const uint32_t key_words[8],
+                                    uint64_t chunk_counter, uint32_t flags) {
+  memcpy(self->chaining_value, key_words, sizeof(self->chaining_value));
+  self->chunk_counter = chunk_counter;
+  memset(self->block, 0, sizeof(self->block));
+  self->block_len = 0;
+  self->blocks_compressed = 0;
+  self->flags = flags;
+}
+
+inline static size_t chunk_state_len(const _blake3_chunk_state *self) {
+  return BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed +
+         (size_t)self->block_len;
+}
+
+inline static uint32_t chunk_state_start_flag(const _blake3_chunk_state *self) {
+  if (self->blocks_compressed == 0) {
+    return CHUNK_START;
+  } else {
+    return 0;
+  }
+}
+
+inline static void chunk_state_update(_blake3_chunk_state *self,
+                                      const void *input, size_t input_len) {
+  const uint8_t *input_u8 = (const uint8_t *)input;
+  while (input_len > 0) {
+    // If the block buffer is full, compress it and clear it. More input is
+    // coming, so this compression is not CHUNK_END.
+    if (self->block_len == BLAKE3_BLOCK_LEN) {
+      uint32_t block_words[16];
+      words_from_little_endian_bytes(self->block, BLAKE3_BLOCK_LEN,
+                                     block_words);
+      uint32_t out16[16];
+      compress(self->chaining_value, block_words, self->chunk_counter,
+               BLAKE3_BLOCK_LEN, self->flags | chunk_state_start_flag(self),
+               out16);
+      memcpy(self->chaining_value, out16, sizeof(self->chaining_value));
+      self->blocks_compressed++;
+      memset(self->block, 0, sizeof(self->block));
+      self->block_len = 0;
+    }
+
+    // Copy input bytes into the block buffer.
+    size_t want = BLAKE3_BLOCK_LEN - (size_t)self->block_len;
+    size_t take = want;
+    if (input_len < want) {
+      take = input_len;
+    }
+    memcpy(&self->block[(size_t)self->block_len], input_u8, take);
+    self->block_len += (uint8_t)take;
+    input_u8 += take;
+    input_len -= take;
+  }
+}
+
+inline static output chunk_state_output(const _blake3_chunk_state *self) {
+  output ret;
+  memcpy(ret.input_chaining_value, self->chaining_value,
+         sizeof(ret.input_chaining_value));
+  words_from_little_endian_bytes(self->block, sizeof(self->block),
+                                 ret.block_words);
+  ret.counter = self->chunk_counter;
+  ret.block_len = (uint32_t)self->block_len;
+  ret.flags = self->flags | chunk_state_start_flag(self) | CHUNK_END;
+  return ret;
+}
+
+inline static output parent_output(const uint32_t left_child_cv[8],
+                                   const uint32_t right_child_cv[8],
+                                   const uint32_t key_words[8],
+                                   uint32_t flags) {
+  output ret;
+  memcpy(ret.input_chaining_value, key_words, sizeof(ret.input_chaining_value));
+  memcpy(&ret.block_words[0], left_child_cv, 8 * 4);
+  memcpy(&ret.block_words[8], right_child_cv, 8 * 4);
+  ret.counter = 0;  // Always 0 for parent nodes.
+  ret.block_len =
+      BLAKE3_BLOCK_LEN;  // Always BLAKE3_BLOCK_LEN (64) for parent nodes.
+  ret.flags = PARENT | flags;
+  return ret;
+}
+
+inline static void parent_cv(const uint32_t left_child_cv[8],
+                             const uint32_t right_child_cv[8],
+                             const uint32_t key_words[8], uint32_t flags,
+                             uint32_t out[8]) {
+  output o = parent_output(left_child_cv, right_child_cv, key_words, flags);
+  // We only write to `out` after we've read the inputs. That makes it safe for
+  // `out` to alias an input, which we do below.
+  output_chaining_value(&o, out);
+}
+
+inline static void hasher_init_internal(blake3_hasher *self,
+                                        const uint32_t key_words[8],
+                                        uint32_t flags) {
+  chunk_state_init(&self->chunk_state, key_words, 0, flags);
+  memcpy(self->key_words, key_words, sizeof(self->key_words));
+  self->cv_stack_len = 0;
+  self->flags = flags;
+}
+
+// Construct a new `Hasher` for the regular hash function.
+void blake3_hasher_init(blake3_hasher *self) {
+  hasher_init_internal(self, IV, 0);
+}
+
+// Construct a new `Hasher` for the keyed hash function.
+void blake3_hasher_init_keyed(blake3_hasher *self,
+                              const uint8_t key[BLAKE3_KEY_LEN]) {
+  uint32_t key_words[8];
+  words_from_little_endian_bytes(key, BLAKE3_KEY_LEN, key_words);
+  hasher_init_internal(self, key_words, KEYED_HASH);
+}
+
+// Construct a new `Hasher` for the key derivation function. The context
+// string should be hardcoded, globally unique, and application-specific.
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
+  blake3_hasher context_hasher;
+  hasher_init_internal(&context_hasher, IV, DERIVE_KEY_CONTEXT);
+  blake3_hasher_update(&context_hasher, context, strlen(context));
+  uint8_t context_key[BLAKE3_KEY_LEN];
+  blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
+  uint32_t context_key_words[8];
+  words_from_little_endian_bytes(context_key, BLAKE3_KEY_LEN,
+                                 context_key_words);
+  hasher_init_internal(self, context_key_words, DERIVE_KEY_MATERIAL);
+}
+
+inline static void hasher_push_stack(blake3_hasher *self,
+                                     const uint32_t cv[8]) {
+  memcpy(&self->cv_stack[(size_t)self->cv_stack_len * 8], cv, 8 * 4);
+  self->cv_stack_len++;
+}
+
+// Returns a pointer to the popped CV, which is valid until the next push.
+inline static const uint32_t *hasher_pop_stack(blake3_hasher *self) {
+  self->cv_stack_len--;
+  return &self->cv_stack[(size_t)self->cv_stack_len * 8];
+}
+
+// Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail.
+inline static void hasher_add_chunk_cv(blake3_hasher *self, uint32_t new_cv[8],
+                                       uint64_t total_chunks) {
+  // This chunk might complete some subtrees. For each completed subtree, its
+  // left child will be the current top entry in the CV stack, and its right
+  // child will be the current value of `new_cv`. Pop each left child off the
+  // stack, merge it with `new_cv`, and overwrite `new_cv` with the result.
+  // After all these merges, push the final value of `new_cv` onto the stack.
+  // The number of completed subtrees is given by the number of trailing 0-bits
+  // in the new total number of chunks.
+  while ((total_chunks & 1) == 0) {
+    parent_cv(hasher_pop_stack(self), new_cv, self->key_words, self->flags,
+              new_cv);
+    total_chunks >>= 1;
+  }
+  hasher_push_stack(self, new_cv);
+}
+
+// Add input to the hash state. This can be called any number of times.
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+                          size_t input_len) {
+  const uint8_t *input_u8 = (const uint8_t *)input;
+  while (input_len > 0) {
+    // If the current chunk is complete, finalize it and reset the chunk state.
+    // More input is coming, so this chunk is not ROOT.
+    if (chunk_state_len(&self->chunk_state) == BLAKE3_CHUNK_LEN) {
+      output chunk_output = chunk_state_output(&self->chunk_state);
+      uint32_t chunk_cv[8];
+      output_chaining_value(&chunk_output, chunk_cv);
+      uint64_t total_chunks = self->chunk_state.chunk_counter + 1;
+      hasher_add_chunk_cv(self, chunk_cv, total_chunks);
+      chunk_state_init(&self->chunk_state, self->key_words, total_chunks,
+                       self->flags);
+    }
+
+    // Compress input bytes into the current chunk state.
+    size_t want = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk_state);
+    size_t take = want;
+    if (input_len < want) {
+      take = input_len;
+    }
+    chunk_state_update(&self->chunk_state, input_u8, take);
+    input_u8 += take;
+    input_len -= take;
+  }
+}
+
+// Finalize the hash and write any number of output bytes.
+void blake3_hasher_finalize(const blake3_hasher *self, void *out,
+                            size_t out_len) {
+  // Starting with the output from the current chunk, compute all the parent
+  // chaining values along the right edge of the tree, until we have the root
+  // output.
+  output current_output = chunk_state_output(&self->chunk_state);
+  size_t parent_nodes_remaining = (size_t)self->cv_stack_len;
+  while (parent_nodes_remaining > 0) {
+    parent_nodes_remaining--;
+    uint32_t current_cv[8];
+    output_chaining_value(&current_output, current_cv);
+    current_output = parent_output(&self->cv_stack[parent_nodes_remaining * 8],
+                                   current_cv, self->key_words, self->flags);
+  }
+  output_root_bytes(&current_output, out, out_len);
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.h b/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.h
new file mode 100644
index 0000000000..027301fd6d
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/blake3_reference_impl.h
@@ -0,0 +1,43 @@
+// Originally from blake3 reference implementation, Public Domain
+// https://github.com/oconnor663/blake3_reference_impl_c
+
+#ifndef _BLAKE3_REFERENCE_IMPL_H
+#define _BLAKE3_REFERENCE_IMPL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define BLAKE3_OUT_LEN 32
+#define BLAKE3_KEY_LEN 32
+#define BLAKE3_BLOCK_LEN 64
+#define BLAKE3_CHUNK_LEN 1024
+
+// This struct is private.
+typedef struct _blake3_chunk_state {
+  uint32_t chaining_value[8];
+  uint64_t chunk_counter;
+  uint8_t block[BLAKE3_BLOCK_LEN];
+  uint8_t block_len;
+  uint8_t blocks_compressed;
+  uint32_t flags;
+} _blake3_chunk_state;
+
+// An incremental hasher that can accept any number of writes.
+typedef struct blake3_hasher { + _blake3_chunk_state chunk_state; + uint32_t key_words[8]; + uint32_t cv_stack[8 * 54]; // Space for 54 subtree chaining values: + uint8_t cv_stack_len; // 2^54 * CHUNK_LEN = 2^64 + uint32_t flags; +} blake3_hasher; + +void blake3_hasher_init(blake3_hasher *self); +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +void blake3_hasher_finalize(const blake3_hasher *self, void *out, + size_t out_len); + +#endif // _BLAKE3_REFERENCE_IMPL_H diff --git a/libsql-ffi/bundled/sqlean/crypto/extension.c b/libsql-ffi/bundled/sqlean/crypto/extension.c new file mode 100644 index 0000000000..c076b2f7b2 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/crypto/extension.c @@ -0,0 +1,225 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite hash and encode/decode functions. + +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "crypto/base32.h" +#include "crypto/base64.h" +#include "crypto/base85.h" +#include "crypto/blake3.h" +#include "crypto/hex.h" +#include "crypto/md5.h" +#include "crypto/sha1.h" +#include "crypto/sha2.h" +#include "crypto/url.h" + +// encoder/decoder function +typedef uint8_t* (*encdec_fn)(const uint8_t* src, size_t len, size_t* out_len); + +// Generic compute hash function. Algorithm is encoded in the user data field. 
+static void crypto_hash(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + return; + } + + void* (*init_func)() = NULL; + void (*update_func)(void*, void*, size_t) = NULL; + int (*final_func)(void*, void*) = NULL; + int algo = (intptr_t)sqlite3_user_data(context); + + switch (algo) { + case 1: /* Hardened SHA1 */ + init_func = (void*)sha1_init; + update_func = (void*)sha1_update; + final_func = (void*)sha1_final; + algo = 1; + break; + case 3: /* Blake3 */ + init_func = (void*)blake3_init; + update_func = (void*)blake3_update; + final_func = (void*)blake3_final; + algo = 3; + break; + case 5: /* MD5 */ + init_func = (void*)md5_init; + update_func = (void*)md5_update; + final_func = (void*)md5_final; + algo = 1; + break; + case 2256: /* SHA2-256 */ + init_func = (void*)sha256_init; + update_func = (void*)sha256_update; + final_func = (void*)sha256_final; + algo = 1; + break; + case 2384: /* SHA2-384 */ + init_func = (void*)sha384_init; + update_func = (void*)sha384_update; + final_func = (void*)sha384_final; + algo = 1; + break; + case 2512: /* SHA2-512 */ + init_func = (void*)sha512_init; + update_func = (void*)sha512_update; + final_func = (void*)sha512_final; + algo = 1; + break; + default: + sqlite3_result_error(context, "unknown algorithm", -1); + return; + } + + void* ctx = NULL; + if (algo) { + ctx = init_func(); + } + if (!ctx) { + sqlite3_result_error(context, "could not allocate algorithm context", -1); + return; + } + + void* data = NULL; + if (sqlite3_value_type(argv[0]) == SQLITE_BLOB) { + data = (void*)sqlite3_value_blob(argv[0]); + } else { + data = (void*)sqlite3_value_text(argv[0]); + } + + size_t datalen = sqlite3_value_bytes(argv[0]); + if (datalen > 0) { + update_func(ctx, data, datalen); + } + + unsigned char hash[128] = {0}; + int hashlen = final_func(ctx, hash); + sqlite3_result_blob(context, hash, hashlen, SQLITE_TRANSIENT); +} + +// Encodes binary data 
into a textual representation using the specified encoder. +static void encode(sqlite3_context* context, int argc, sqlite3_value** argv, encdec_fn encode_fn) { + assert(argc == 1); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + size_t source_len = sqlite3_value_bytes(argv[0]); + const uint8_t* source = (uint8_t*)sqlite3_value_blob(argv[0]); + size_t result_len = 0; + const char* result = (char*)encode_fn(source, source_len, &result_len); + sqlite3_result_text(context, result, -1, free); +} + +// Encodes binary data into a textual representation using the specified algorithm. +// encode('hello', 'base64') = 'aGVsbG8=' +static void crypto_encode(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + size_t n = sqlite3_value_bytes(argv[1]); + const char* format = (char*)sqlite3_value_text(argv[1]); + if (strncmp(format, "base32", n) == 0) { + encode(context, 1, argv, base32_encode); + return; + } + if (strncmp(format, "base64", n) == 0) { + encode(context, 1, argv, base64_encode); + return; + } + if (strncmp(format, "base85", n) == 0) { + encode(context, 1, argv, base85_encode); + return; + } + if (strncmp(format, "hex", n) == 0) { + encode(context, 1, argv, hex_encode); + return; + } + if (strncmp(format, "url", n) == 0) { + encode(context, 1, argv, url_encode); + return; + } + sqlite3_result_error(context, "unknown encoding", -1); +} + +// Decodes binary data from a textual representation using the specified decoder. 
+// A NULL argument yields NULL and an empty string yields an empty blob.
+// If the decoder returns NULL the call fails with "invalid input string".
+// The decoded buffer is released with free() once SQLite is done with it.
+static void decode(sqlite3_context* context, int argc, sqlite3_value** argv, encdec_fn decode_fn) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) == SQLITE_NULL) {
+        sqlite3_result_null(context);
+        return;
+    }
+
+    size_t source_len = sqlite3_value_bytes(argv[0]);
+    const uint8_t* source = (uint8_t*)sqlite3_value_text(argv[0]);
+    if (source_len == 0) {
+        sqlite3_result_zeroblob(context, 0);
+        return;
+    }
+
+    size_t result_len = 0;
+    const uint8_t* result = decode_fn(source, source_len, &result_len);
+    if (result == NULL) {
+        sqlite3_result_error(context, "invalid input string", -1);
+        return;
+    }
+
+    sqlite3_result_blob(context, result, result_len, free);
+}
+
+// Decodes binary data from a textual representation using the specified algorithm.
+// decode('aGVsbG8=', 'base64') = cast('hello' as blob)
+// The algorithm name must match exactly: one of base32, base64, base85, hex, url.
+// Anything else, including NULL, a prefix such as 'base' or an empty string,
+// raises "unknown encoding".
+static void crypto_decode(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    // sqlite3_value_text() returns a NUL-terminated string (or NULL), so the
+    // whole name can be compared with strcmp(). Comparing only as many bytes
+    // as the argument is long would accept any prefix of a valid name.
+    const char* format = (const char*)sqlite3_value_text(argv[1]);
+    if (format == NULL) {
+        sqlite3_result_error(context, "unknown encoding", -1);
+        return;
+    }
+    if (strcmp(format, "base32") == 0) {
+        decode(context, 1, argv, base32_decode);
+        return;
+    }
+    if (strcmp(format, "base64") == 0) {
+        decode(context, 1, argv, base64_decode);
+        return;
+    }
+    if (strcmp(format, "base85") == 0) {
+        decode(context, 1, argv, base85_decode);
+        return;
+    }
+    if (strcmp(format, "hex") == 0) {
+        decode(context, 1, argv, hex_decode);
+        return;
+    }
+    if (strcmp(format, "url") == 0) {
+        decode(context, 1, argv, url_decode);
+        return;
+    }
+    sqlite3_result_error(context, "unknown encoding", -1);
+}
+
+// Registers the hash and encode/decode SQL functions on `db`.
+// Every function is available under a crypto_-prefixed name and a short alias.
+// For the hash functions the user-data pointer carries the algorithm id that
+// crypto_hash() switches on. The return codes of sqlite3_create_function()
+// are not checked; the function always returns SQLITE_OK.
+int crypto_init(sqlite3* db) {
+    static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC;
+    sqlite3_create_function(db, "crypto_blake3", 1, flags, (void*)3, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "blake3", 1, flags, (void*)3, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "crypto_md5", 1, flags, (void*)5, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "md5", 1, flags, (void*)5, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "crypto_sha1", 1, flags, (void*)1, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "sha1", 1, flags, (void*)1, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "crypto_sha256", 1, flags, (void*)2256, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "sha256", 1, flags, (void*)2256, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "crypto_sha384", 1, flags, (void*)2384, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "sha384", 1, flags, (void*)2384, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "crypto_sha512", 1, flags, (void*)2512, crypto_hash, 0, 0);
+    sqlite3_create_function(db, "sha512", 1, flags, (void*)2512, crypto_hash, 0, 0);
+
+    sqlite3_create_function(db, "crypto_encode", 2, flags, 0, crypto_encode, 0, 0);
+    sqlite3_create_function(db, "encode", 2, flags, 0, crypto_encode, 0, 0);
+    sqlite3_create_function(db, "crypto_decode", 2, flags, 0, crypto_decode, 0, 0);
+    sqlite3_create_function(db, "decode", 2, flags, 0, crypto_decode, 0, 0);
+    return SQLITE_OK;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/extension.h b/libsql-ffi/bundled/sqlean/crypto/extension.h
new file mode 100644
index 0000000000..494bdfe48e
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/extension.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// SQLite hash and encode/decode functions.
+
+#ifndef CRYPTO_EXTENSION_H
+#define CRYPTO_EXTENSION_H
+
+#include "sqlite3ext.h"
+
+int crypto_init(sqlite3* db);
+
+#endif /* CRYPTO_EXTENSION_H */
diff --git a/libsql-ffi/bundled/sqlean/crypto/hex.c b/libsql-ffi/bundled/sqlean/crypto/hex.c
new file mode 100644
index 0000000000..4c87b97c13
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/hex.c
@@ -0,0 +1,76 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Hex encoding/decoding
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Returns the value (0..15) of the hex digit `c`, or -1 if `c` is not a hex digit.
+static int hex_digit_value(uint8_t c) {
+    if (c >= '0' && c <= '9') {
+        return c - '0';
+    }
+    if (c >= 'A' && c <= 'F') {
+        return c - 'A' + 10;
+    }
+    if (c >= 'a' && c <= 'f') {
+        return c - 'a' + 10;
+    }
+    return -1;
+}
+
+// Encodes `len` bytes of `src` as lowercase hex.
+// Returns a malloc'ed, NUL-terminated string that the caller must free and
+// stores its length (excluding the terminator) in `*out_len`.
+// On allocation failure returns NULL and sets `*out_len` to 0.
+uint8_t* hex_encode(const uint8_t* src, size_t len, size_t* out_len) {
+    static const char digits[] = "0123456789abcdef";
+    *out_len = 0;
+    uint8_t* encoded = malloc(len * 2 + 1);
+    if (encoded == NULL) {
+        return NULL;
+    }
+    for (size_t i = 0; i < len; i++) {
+        encoded[i * 2] = (uint8_t)digits[src[i] >> 4];
+        encoded[i * 2 + 1] = (uint8_t)digits[src[i] & 0x0F];
+    }
+    encoded[len * 2] = '\0';
+    *out_len = len * 2;
+    return encoded;
+}
+
+// Decodes `len` hex characters of `src` (upper or lower case) into bytes.
+// Returns a malloc'ed buffer that the caller must free and stores its size
+// in `*out_len`. Returns NULL with `*out_len` set to 0 if `len` is odd, if
+// `src` contains a non-hex character, or if allocation fails.
+uint8_t* hex_decode(const uint8_t* src, size_t len, size_t* out_len) {
+    *out_len = 0;
+    if (len % 2 != 0) {
+        // input length must be even
+        return NULL;
+    }
+
+    size_t decoded_len = len / 2;
+    // never request 0 bytes: malloc(0) may legally return NULL
+    uint8_t* decoded = malloc(decoded_len > 0 ? decoded_len : 1);
+    if (decoded == NULL) {
+        return NULL;
+    }
+
+    for (size_t i = 0; i < decoded_len; i++) {
+        int hi = hex_digit_value(src[i * 2]);
+        int lo = hex_digit_value(src[i * 2 + 1]);
+        if (hi < 0 || lo < 0) {
+            // invalid character
+            free(decoded);
+            return NULL;
+        }
+        decoded[i] = (uint8_t)((hi << 4) | lo);
+    }
+
+    *out_len = decoded_len;
+    return decoded;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/hex.h b/libsql-ffi/bundled/sqlean/crypto/hex.h
new file mode 100644
index 0000000000..c1ea1c0808
--- /dev/null
+++
b/libsql-ffi/bundled/sqlean/crypto/hex.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Hex encoding/decoding
+
+#ifndef _HEX_H_
+#define _HEX_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint8_t* hex_encode(const uint8_t* src, size_t len, size_t* out_len);
+uint8_t* hex_decode(const uint8_t* src, size_t len, size_t* out_len);
+
+#endif /* _HEX_H_ */
diff --git a/libsql-ffi/bundled/sqlean/crypto/md5.c b/libsql-ffi/bundled/sqlean/crypto/md5.c
new file mode 100644
index 0000000000..4ae8ebf6bb
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/md5.c
@@ -0,0 +1,211 @@
+/*********************************************************************
+ * Filename: md5.c
+ * Author: Brad Conte (brad AT bradconte.com)
+ * Source: https://github.com/B-Con/crypto-algorithms
+ * License: Public Domain
+ * Details: Implementation of the MD5 hashing algorithm.
+ * Algorithm specification can be found here:
+ * http://tools.ietf.org/html/rfc1321
+ * This implementation uses little endian byte order.
+ *********************************************************************/
+
+/*************************** HEADER FILES ***************************/
+#include <stdlib.h>
+#include <string.h>
+
+#include "crypto/md5.h"
+/****************************** MACROS ******************************/
+#define ROTLEFT(a, b) ((a << b) | (a >> (32 - b)))
+
+#define F(x, y, z) ((x & y) | (~x & z))
+#define G(x, y, z) ((x & z) | (y & ~z))
+#define H(x, y, z) (x ^ y ^ z)
+#define I(x, y, z) (y ^ (x | ~z))
+
+#define FF(a, b, c, d, m, s, t)  \
+    {                            \
+        a += F(b, c, d) + m + t; \
+        a = b + ROTLEFT(a, s);   \
+    }
+#define GG(a, b, c, d, m, s, t)  \
+    {                            \
+        a += G(b, c, d) + m + t; \
+        a = b + ROTLEFT(a, s);   \
+    }
+#define HH(a, b, c, d, m, s, t)  \
+    {                            \
+        a += H(b, c, d) + m + t; \
+        a = b + ROTLEFT(a, s);   \
+    }
+#define II(a, b, c, d, m, s, t)  \
+    {                            \
+        a += I(b, c, d) + m + t; \
+        a = b + ROTLEFT(a, s);   \
+    }
+
+/*********************** FUNCTION DEFINITIONS ***********************/
+// Folds one 64-byte block `data` into the running state in ctx->state.
+static void md5_transform(MD5_CTX* ctx, const BYTE data[]) {
+    WORD a, b, c, d, m[16], i, j;
+
+    // MD5 specifies big endian byte order, but this implementation assumes a little
+    // endian byte order CPU. Reverse all the bytes upon input, and re-reverse them
+    // on output (in md5_final()).
+    for (i = 0, j = 0; i < 16; ++i, j += 4)
+        m[i] = (data[j]) + (data[j + 1] << 8) + (data[j + 2] << 16) + ((WORD)data[j + 3] << 24);
+
+    a = ctx->state[0];
+    b = ctx->state[1];
+    c = ctx->state[2];
+    d = ctx->state[3];
+
+    FF(a, b, c, d, m[0], 7, 0xd76aa478);
+    FF(d, a, b, c, m[1], 12, 0xe8c7b756);
+    FF(c, d, a, b, m[2], 17, 0x242070db);
+    FF(b, c, d, a, m[3], 22, 0xc1bdceee);
+    FF(a, b, c, d, m[4], 7, 0xf57c0faf);
+    FF(d, a, b, c, m[5], 12, 0x4787c62a);
+    FF(c, d, a, b, m[6], 17, 0xa8304613);
+    FF(b, c, d, a, m[7], 22, 0xfd469501);
+    FF(a, b, c, d, m[8], 7, 0x698098d8);
+    FF(d, a, b, c, m[9], 12, 0x8b44f7af);
+    FF(c, d, a, b, m[10], 17, 0xffff5bb1);
+    FF(b, c, d, a, m[11], 22, 0x895cd7be);
+    FF(a, b, c, d, m[12], 7, 0x6b901122);
+    FF(d, a, b, c, m[13], 12, 0xfd987193);
+    FF(c, d, a, b, m[14], 17, 0xa679438e);
+    FF(b, c, d, a, m[15], 22, 0x49b40821);
+
+    GG(a, b, c, d, m[1], 5, 0xf61e2562);
+    GG(d, a, b, c, m[6], 9, 0xc040b340);
+    GG(c, d, a, b, m[11], 14, 0x265e5a51);
+    GG(b, c, d, a, m[0], 20, 0xe9b6c7aa);
+    GG(a, b, c, d, m[5], 5, 0xd62f105d);
+    GG(d, a, b, c, m[10], 9, 0x02441453);
+    GG(c, d, a, b, m[15], 14, 0xd8a1e681);
+    GG(b, c, d, a, m[4], 20, 0xe7d3fbc8);
+    GG(a, b, c, d, m[9], 5, 0x21e1cde6);
+    GG(d, a, b, c, m[14], 9, 0xc33707d6);
+    GG(c, d, a, b, m[3], 14, 0xf4d50d87);
+    GG(b, c, d, a, m[8], 20, 0x455a14ed);
+    GG(a, b, c, d, m[13], 5, 0xa9e3e905);
+    GG(d, a, b, c, m[2], 9, 0xfcefa3f8);
+    GG(c, d, a, b, m[7], 14, 0x676f02d9);
+    GG(b, c, d, a, m[12], 20, 0x8d2a4c8a);
+
+    HH(a, b, c, d, m[5], 4, 0xfffa3942);
+    HH(d, a, b, c, m[8], 11, 0x8771f681);
+    HH(c, d, a, b, m[11], 16, 0x6d9d6122);
+    HH(b, c, d, a, m[14], 23, 0xfde5380c);
+    HH(a, b, c, d, m[1], 4, 0xa4beea44);
+    HH(d, a, b, c, m[4], 11, 0x4bdecfa9);
+    HH(c, d, a, b, m[7], 16, 0xf6bb4b60);
+    HH(b, c, d, a, m[10], 23, 0xbebfbc70);
+    HH(a, b, c, d, m[13], 4, 0x289b7ec6);
+    HH(d, a, b, c, m[0], 11, 0xeaa127fa);
+    HH(c, d, a, b, m[3], 16, 0xd4ef3085);
+    HH(b, c, d, a, m[6], 23, 0x04881d05);
+    HH(a, b, c, d, m[9], 4, 0xd9d4d039);
+    HH(d, a, b, c, m[12], 11, 0xe6db99e5);
+    HH(c, d, a, b, m[15], 16, 0x1fa27cf8);
+    HH(b, c, d, a, m[2], 23, 0xc4ac5665);
+
+    II(a, b, c, d, m[0], 6, 0xf4292244);
+    II(d, a, b, c, m[7], 10, 0x432aff97);
+    II(c, d, a, b, m[14], 15, 0xab9423a7);
+    II(b, c, d, a, m[5], 21, 0xfc93a039);
+    II(a, b, c, d, m[12], 6, 0x655b59c3);
+    II(d, a, b, c, m[3], 10, 0x8f0ccc92);
+    II(c, d, a, b, m[10], 15, 0xffeff47d);
+    II(b, c, d, a, m[1], 21, 0x85845dd1);
+    II(a, b, c, d, m[8], 6, 0x6fa87e4f);
+    II(d, a, b, c, m[15], 10, 0xfe2ce6e0);
+    II(c, d, a, b, m[6], 15, 0xa3014314);
+    II(b, c, d, a, m[13], 21, 0x4e0811a1);
+    II(a, b, c, d, m[4], 6, 0xf7537e82);
+    II(d, a, b, c, m[11], 10, 0xbd3af235);
+    II(c, d, a, b, m[2], 15, 0x2ad7d2bb);
+    II(b, c, d, a, m[9], 21, 0xeb86d391);
+
+    ctx->state[0] += a;
+    ctx->state[1] += b;
+    ctx->state[2] += c;
+    ctx->state[3] += d;
+}
+
+// Allocates a context and loads the four initial state words.
+// Returns NULL if the allocation fails; otherwise md5_final() releases it.
+void* md5_init() {
+    MD5_CTX* ctx = malloc(sizeof(MD5_CTX));
+    if (ctx == NULL) {
+        return NULL;
+    }
+    ctx->datalen = 0;
+    ctx->bitlen = 0;
+    ctx->state[0] = 0x67452301;
+    ctx->state[1] = 0xEFCDAB89;
+    ctx->state[2] = 0x98BADCFE;
+    ctx->state[3] = 0x10325476;
+    return ctx;
+}
+
+// Appends `len` bytes of `data` to the message, running md5_transform()
+// each time the 64-byte buffer fills up.
+void md5_update(MD5_CTX* ctx, const BYTE data[], size_t len) {
+    size_t i;
+
+    for (i = 0; i < len; ++i) {
+        ctx->data[ctx->datalen] = data[i];
+        ctx->datalen++;
+        if (ctx->datalen == 64) {
+            md5_transform(ctx, ctx->data);
+            ctx->bitlen += 512;
+            ctx->datalen = 0;
+        }
+    }
+}
+
+// Pads the message, appends its bit length, writes the 16-byte digest to
+// `hash` and frees `ctx`, which must not be used afterwards.
+// Returns the digest size, MD5_BLOCK_SIZE.
+int md5_final(MD5_CTX* ctx, BYTE hash[]) {
+    size_t i;
+
+    i = ctx->datalen;
+
+    // Pad whatever data is left in the buffer.
+    if (ctx->datalen < 56) {
+        ctx->data[i++] = 0x80;
+        while (i < 56)
+            ctx->data[i++] = 0x00;
+    } else {
+        ctx->data[i++] = 0x80;
+        while (i < 64)
+            ctx->data[i++] = 0x00;
+        md5_transform(ctx, ctx->data);
+        memset(ctx->data, 0, 56);
+    }
+
+    // Append to the padding the total message's length in bits and transform.
+    ctx->bitlen += ctx->datalen * 8;
+    ctx->data[56] = ctx->bitlen;
+    ctx->data[57] = ctx->bitlen >> 8;
+    ctx->data[58] = ctx->bitlen >> 16;
+    ctx->data[59] = ctx->bitlen >> 24;
+    ctx->data[60] = ctx->bitlen >> 32;
+    ctx->data[61] = ctx->bitlen >> 40;
+    ctx->data[62] = ctx->bitlen >> 48;
+    ctx->data[63] = ctx->bitlen >> 56;
+    md5_transform(ctx, ctx->data);
+
+    // Since this implementation uses little endian byte ordering and MD uses big endian,
+    // reverse all the bytes when copying the final state to the output hash.
+    for (i = 0; i < 4; ++i) {
+        hash[i] = (ctx->state[0] >> (i * 8)) & 0x000000ff;
+        hash[i + 4] = (ctx->state[1] >> (i * 8)) & 0x000000ff;
+        hash[i + 8] = (ctx->state[2] >> (i * 8)) & 0x000000ff;
+        hash[i + 12] = (ctx->state[3] >> (i * 8)) & 0x000000ff;
+    }
+    free(ctx);
+    return MD5_BLOCK_SIZE;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/md5.h b/libsql-ffi/bundled/sqlean/crypto/md5.h
new file mode 100644
index 0000000000..001ad12a94
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/md5.h
@@ -0,0 +1,34 @@
+/*********************************************************************
+ * Filename: md5.h
+ * Author: Brad Conte (brad AT bradconte.com)
+ * Source: https://github.com/B-Con/crypto-algorithms
+ * License: Public Domain
+ * Details: Defines the API for the corresponding MD5 implementation.
+ *********************************************************************/
+
+#ifndef MD5_H
+#define MD5_H
+
+/*************************** HEADER FILES ***************************/
+#include <stddef.h>
+
+/****************************** MACROS ******************************/
+#define MD5_BLOCK_SIZE 16 // MD5 outputs a 16 byte digest
+
+/**************************** DATA TYPES ****************************/
+typedef unsigned char BYTE; // 8-bit byte
+typedef unsigned int WORD; // 32-bit word, change to "long" for 16-bit machines
+
+typedef struct {
+    BYTE data[64];
+    WORD datalen;
+    unsigned long long bitlen;
+    WORD state[4];
+} MD5_CTX;
+
+/*********************** FUNCTION DECLARATIONS **********************/
+void* md5_init();
+void md5_update(MD5_CTX* ctx, const BYTE data[], size_t len);
+int md5_final(MD5_CTX* ctx, BYTE hash[]);
+
+#endif // MD5_H
diff --git a/libsql-ffi/bundled/sqlean/crypto/sha1.c b/libsql-ffi/bundled/sqlean/crypto/sha1.c
new file mode 100644
index 0000000000..fe0af8762c
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/sha1.c
@@ -0,0 +1,247 @@
+// Originally from the sha1 SQLite exension, Public Domain
+// https://sqlite.org/src/file/ext/misc/sha1.c
+// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "crypto/sha1.h"
+
+#define SHA_ROT(x, l, r) ((x) << (l) | (x) >> (r))
+#define rol(x, k) SHA_ROT(x, k, 32 - (k))
+#define ror(x, k) SHA_ROT(x, 32 - (k), k)
+
+#define blk0le(i) (block[i] = (ror(block[i], 8) & 0xFF00FF00) | (rol(block[i], 8) & 0x00FF00FF))
+#define blk0be(i) block[i]
+#define blk(i) \
+    (block[i & 15] = \
+         rol(block[(i + 13) & 15] ^ block[(i + 8) & 15] ^ block[(i + 2) & 15] ^ block[i & 15], 1))
+
+/*
+ * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1
+ *
+ * Rl0() for little-endian and Rb0() for big-endian. Endianness is
+ * determined at run-time.
+ */
+#define Rl0(v, w, x, y, z, i) \
+    z += ((w & (x ^ y)) ^ y) + blk0le(i) + 0x5A827999 + rol(v, 5); \
+    w = ror(w, 2);
+#define Rb0(v, w, x, y, z, i) \
+    z += ((w & (x ^ y)) ^ y) + blk0be(i) + 0x5A827999 + rol(v, 5); \
+    w = ror(w, 2);
+#define R1(v, w, x, y, z, i) \
+    z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \
+    w = ror(w, 2);
+#define R2(v, w, x, y, z, i) \
+    z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \
+    w = ror(w, 2);
+#define R3(v, w, x, y, z, i) \
+    z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \
+    w = ror(w, 2);
+#define R4(v, w, x, y, z, i) \
+    z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \
+    w = ror(w, 2);
+
+/*
+ * Hash a single 512-bit block. This is the core of the algorithm.
+ */
+void SHA1Transform(unsigned int state[5], const unsigned char buffer[64]) {
+    unsigned int qq[5]; /* a, b, c, d, e; */
+    static int one = 1;
+    unsigned int block[16];
+    memcpy(block, buffer, 64);
+    memcpy(qq, state, 5 * sizeof(unsigned int));
+
+#define a qq[0]
+#define b qq[1]
+#define c qq[2]
+#define d qq[3]
+#define e qq[4]
+
+    /* Copy ctx->state[] to working vars */
+    /*
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    */
+
+    /* 4 rounds of 20 operations each. Loop unrolled. */
+    if (1 == *(unsigned char*)&one) {
+        Rl0(a, b, c, d, e, 0);
+        Rl0(e, a, b, c, d, 1);
+        Rl0(d, e, a, b, c, 2);
+        Rl0(c, d, e, a, b, 3);
+        Rl0(b, c, d, e, a, 4);
+        Rl0(a, b, c, d, e, 5);
+        Rl0(e, a, b, c, d, 6);
+        Rl0(d, e, a, b, c, 7);
+        Rl0(c, d, e, a, b, 8);
+        Rl0(b, c, d, e, a, 9);
+        Rl0(a, b, c, d, e, 10);
+        Rl0(e, a, b, c, d, 11);
+        Rl0(d, e, a, b, c, 12);
+        Rl0(c, d, e, a, b, 13);
+        Rl0(b, c, d, e, a, 14);
+        Rl0(a, b, c, d, e, 15);
+    } else {
+        Rb0(a, b, c, d, e, 0);
+        Rb0(e, a, b, c, d, 1);
+        Rb0(d, e, a, b, c, 2);
+        Rb0(c, d, e, a, b, 3);
+        Rb0(b, c, d, e, a, 4);
+        Rb0(a, b, c, d, e, 5);
+        Rb0(e, a, b, c, d, 6);
+        Rb0(d, e, a, b, c, 7);
+        Rb0(c, d, e, a, b, 8);
+        Rb0(b, c, d, e, a, 9);
+        Rb0(a, b, c, d, e, 10);
+        Rb0(e, a, b, c, d, 11);
+        Rb0(d, e, a, b, c, 12);
+        Rb0(c, d, e, a, b, 13);
+        Rb0(b, c, d, e, a, 14);
+        Rb0(a, b, c, d, e, 15);
+    }
+    R1(e, a, b, c, d, 16);
+    R1(d, e, a, b, c, 17);
+    R1(c, d, e, a, b, 18);
+    R1(b, c, d, e, a, 19);
+    R2(a, b, c, d, e, 20);
+    R2(e, a, b, c, d, 21);
+    R2(d, e, a, b, c, 22);
+    R2(c, d, e, a, b, 23);
+    R2(b, c, d, e, a, 24);
+    R2(a, b, c, d, e, 25);
+    R2(e, a, b, c, d, 26);
+    R2(d, e, a, b, c, 27);
+    R2(c, d, e, a, b, 28);
+    R2(b, c, d, e, a, 29);
+    R2(a, b, c, d, e, 30);
+    R2(e, a, b, c, d, 31);
+    R2(d, e, a, b, c, 32);
+    R2(c, d, e, a, b, 33);
+    R2(b, c, d, e, a, 34);
+    R2(a, b, c, d, e, 35);
+    R2(e, a, b, c, d, 36);
+    R2(d, e, a, b, c, 37);
+    R2(c, d, e, a, b, 38);
+    R2(b, c, d, e, a, 39);
+    R3(a, b, c, d, e, 40);
+    R3(e, a, b, c, d, 41);
+    R3(d, e, a, b, c, 42);
+    R3(c, d, e, a, b, 43);
+    R3(b, c, d, e, a, 44);
+    R3(a, b, c, d, e, 45);
+    R3(e, a, b, c, d, 46);
+    R3(d, e, a, b, c, 47);
+    R3(c, d, e, a, b, 48);
+    R3(b, c, d, e, a, 49);
+    R3(a, b, c, d, e, 50);
+    R3(e, a, b, c, d, 51);
+    R3(d, e, a, b, c, 52);
+    R3(c, d, e, a, b, 53);
+    R3(b, c, d, e, a, 54);
+    R3(a, b, c, d, e, 55);
+    R3(e, a, b, c, d, 56);
+    R3(d, e, a, b, c, 57);
+    R3(c, d, e, a, b, 58);
+    R3(b, c, d, e, a, 59);
+    R4(a, b, c, d, e, 60);
+    R4(e, a, b, c, d, 61);
+    R4(d, e, a, b, c, 62);
+    R4(c, d, e, a, b, 63);
+    R4(b, c, d, e, a, 64);
+    R4(a, b, c, d, e, 65);
+    R4(e, a, b, c, d, 66);
+    R4(d, e, a, b, c, 67);
+    R4(c, d, e, a, b, 68);
+    R4(b, c, d, e, a, 69);
+    R4(a, b, c, d, e, 70);
+    R4(e, a, b, c, d, 71);
+    R4(d, e, a, b, c, 72);
+    R4(c, d, e, a, b, 73);
+    R4(b, c, d, e, a, 74);
+    R4(a, b, c, d, e, 75);
+    R4(e, a, b, c, d, 76);
+    R4(d, e, a, b, c, 77);
+    R4(c, d, e, a, b, 78);
+    R4(b, c, d, e, a, 79);
+
+    /* Add the working vars back into context.state[] */
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+
+#undef a
+#undef b
+#undef c
+#undef d
+#undef e
+}
+
+/* Initialize a SHA1 context.
+ * Returns NULL if the context cannot be allocated. */
+void* sha1_init() {
+    SHA1Context* ctx = malloc(sizeof(SHA1Context));
+    if (ctx == NULL) {
+        return NULL;
+    }
+    /* SHA1 initialization constants */
+    ctx->state[0] = 0x67452301;
+    ctx->state[1] = 0xEFCDAB89;
+    ctx->state[2] = 0x98BADCFE;
+    ctx->state[3] = 0x10325476;
+    ctx->state[4] = 0xC3D2E1F0;
+    ctx->count[0] = ctx->count[1] = 0;
+    return ctx;
+}
+
+/* Add new content to the SHA1 hash */
+void sha1_update(SHA1Context* ctx, const unsigned char* data, size_t len) {
+    unsigned int i, j;
+
+    j = ctx->count[0];
+    if ((ctx->count[0] += len << 3) < j) {
+        ctx->count[1] += (len >> 29) + 1;
+    }
+    j = (j >> 3) & 63;
+    if ((j + len) > 63) {
+        (void)memcpy(&ctx->buffer[j], data, (i = 64 - j));
+        SHA1Transform(ctx->state, ctx->buffer);
+        for (; i + 63 < len; i += 64) {
+            SHA1Transform(ctx->state, &data[i]);
+        }
+        j = 0;
+    } else {
+        i = 0;
+    }
+    (void)memcpy(&ctx->buffer[j], &data[i], len - i);
+}
+
+/* Pad the message, append its bit length and write the 20-byte digest to
+ * `hash`. Frees `ctx`, which must not be used afterwards.
+ * Returns the digest size, SHA1_BLOCK_SIZE. */
+int sha1_final(SHA1Context* ctx, unsigned char hash[]) {
+    unsigned int i;
+    unsigned char finalcount[8];
+
+    for (i = 0; i < 8; i++) {
+        finalcount[i] = (unsigned char)((ctx->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) &
+                                        255); /* Endian independent */
+    }
+    sha1_update(ctx, (const unsigned char*)"\200", 1);
+    while ((ctx->count[0] & 504) != 448) {
+        sha1_update(ctx, (const unsigned char*)"\0", 1);
+    }
+    sha1_update(ctx, finalcount, 8); /* Should cause a SHA1Transform() */
+    for (i = 0; i < 20; i++) {
+        hash[i] = (unsigned char)((ctx->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+    }
+    free(ctx);
+    return SHA1_BLOCK_SIZE;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/sha1.h b/libsql-ffi/bundled/sqlean/crypto/sha1.h
new file mode 100644
index 0000000000..54990f96b5
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/sha1.h
@@ -0,0 +1,21 @@
+// Adapted from https://sqlite.org/src/file/ext/misc/sha1.c
+// Public domain
+
+#ifndef __SHA1_H__
+#define __SHA1_H__
+
+#include <stddef.h>
+
+#define SHA1_BLOCK_SIZE 20
+
+typedef struct SHA1Context {
+    unsigned int state[5];
+    unsigned int count[2];
+    unsigned char buffer[64];
+} SHA1Context;
+
+void* sha1_init();
+void sha1_update(SHA1Context* ctx, const unsigned char data[], size_t len);
+int sha1_final(SHA1Context* ctx, unsigned char hash[]);
+
+#endif
diff --git a/libsql-ffi/bundled/sqlean/crypto/sha2.c b/libsql-ffi/bundled/sqlean/crypto/sha2.c
new file mode 100644
index 0000000000..0f4dc0ad06
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/sha2.c
@@ -0,0 +1,938 @@
+/*
+ * FILE: sha2.c
+ * AUTHOR: Aaron D. Gifford - http://www.aarongifford.com/
+ *
+ * Copyright (c) 2000-2001, Aaron D. Gifford
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTOR(S) ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTOR(S) BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: sha2.c,v 1.1 2001/11/08 00:01:51 adg Exp adg $ + */ + +#include /* assert() */ +#include +#include /* memcpy()/memset() or bcopy()/bzero() */ + +#include "crypto/sha2.h" + +/* + * ASSERT NOTE: + * Some sanity checking code is included using assert(). On my FreeBSD + * system, this additional code can be removed by compiling with NDEBUG + * defined. Check your own systems manpage on assert() to see how to + * compile WITHOUT the sanity checking code on your system. + * + * UNROLLED TRANSFORM LOOP NOTE: + * You can define SHA2_UNROLL_TRANSFORM to use the unrolled transform + * loop version for the hash transform rounds (defined using macros + * later in this file). 
Either define on the command line, for example: + * + * cc -DSHA2_UNROLL_TRANSFORM -o sha2 sha2.c sha2prog.c + * + * or define below: + * + * #define SHA2_UNROLL_TRANSFORM + * + */ + +/*** SHA-256/384/512 Machine Architecture Definitions *****************/ +/* + * BYTE_ORDER NOTE: + * + * Please make sure that your system defines BYTE_ORDER. If your + * architecture is little-endian, make sure it also defines + * LITTLE_ENDIAN and that the two (BYTE_ORDER and LITTLE_ENDIAN) are + * equivilent. + * + * If your system does not define the above, then you can do so by + * hand like this: + * + * #define LITTLE_ENDIAN 1234 + * #define BIG_ENDIAN 4321 + * + * And for little-endian machines, add: + * + * #define BYTE_ORDER LITTLE_ENDIAN + * + * Or for big-endian machines: + * + * #define BYTE_ORDER BIG_ENDIAN + * + * The FreeBSD machine this was written on defines BYTE_ORDER + * appropriately by including (which in turn includes + * where the appropriate definitions are actually + * made). + */ + +#ifdef __BYTE_ORDER__ +#ifndef BYTE_ORDER +#define BYTE_ORDER __BYTE_ORDER__ +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __ORDER_BIG_ENDIAN__ +#endif +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ +#endif +#endif + +#ifndef BYTE_ORDER +#if defined(i386) || defined(__i386__) || defined(_M_IX86) || defined(__x86_64) || \ + defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || defined(_M_ARM) || \ + defined(__x86) || defined(__arm__) +#define BYTE_ORDER 1234 +#elif defined(sparc) || defined(__ppc__) +#define BYTE_ORDER 4321 +#else +#define BYTE_ORDER 0 +#endif +#endif + +#if !defined(BYTE_ORDER) || (BYTE_ORDER != LITTLE_ENDIAN && BYTE_ORDER != BIG_ENDIAN) +#error Define BYTE_ORDER to be equal to either LITTLE_ENDIAN or BIG_ENDIAN +#endif + +/* + * Define the followingsha2_* types to types of the correct length on + * the native archtecture. Most BSD systems and Linux define u_intXX_t + * types. 
Machines with very recent ANSI C headers, can use the + * uintXX_t definintions from inttypes.h by defining SHA2_USE_INTTYPES_H + * during compile or in the sha.h header file. + * + * Machines that support neither u_intXX_t nor inttypes.h's uintXX_t + * will need to define these three typedefs below (and the appropriate + * ones in sha.h too) by hand according to their system architecture. + * + * Thank you, Jun-ichiro itojun Hagino, for suggesting using u_intXX_t + * types and pointing out recent ANSI C support for uintXX_t in inttypes.h. + */ +#ifdef SHA2_USE_INTTYPES_H + +typedef uint8_t sha2_byte; /* Exactly 1 byte */ +typedef uint32_t sha2_word32; /* Exactly 4 bytes */ +typedef uint64_t sha2_word64; /* Exactly 8 bytes */ + +#else /* SHA2_USE_INTTYPES_H */ + +typedef u_int8_t sha2_byte; /* Exactly 1 byte */ +typedef u_int32_t sha2_word32; /* Exactly 4 bytes */ +typedef u_int64_t sha2_word64; /* Exactly 8 bytes */ + +#endif /* SHA2_USE_INTTYPES_H */ + +/*** SHA-256/384/512 Various Length Definitions ***********************/ +/* NOTE: Most of these are in sha2.h */ +#define SHA256_SHORT_BLOCK_LENGTH (SHA256_BLOCK_LENGTH - 8) +#define SHA384_SHORT_BLOCK_LENGTH (SHA384_BLOCK_LENGTH - 16) +#define SHA512_SHORT_BLOCK_LENGTH (SHA512_BLOCK_LENGTH - 16) + +/*** ENDIAN REVERSAL MACROS *******************************************/ +#if BYTE_ORDER == LITTLE_ENDIAN +#define REVERSE32(w, x) \ + { \ + sha2_word32 tmp = (w); \ + tmp = (tmp >> 16) | (tmp << 16); \ + (x) = ((tmp & 0xff00ff00UL) >> 8) | ((tmp & 0x00ff00ffUL) << 8); \ + } +#define REVERSE64(w, x) \ + { \ + sha2_word64 tmp = (w); \ + tmp = (tmp >> 32) | (tmp << 32); \ + tmp = ((tmp & 0xff00ff00ff00ff00ULL) >> 8) | ((tmp & 0x00ff00ff00ff00ffULL) << 8); \ + (x) = ((tmp & 0xffff0000ffff0000ULL) >> 16) | ((tmp & 0x0000ffff0000ffffULL) << 16); \ + } +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ + +/* + * Macro for incrementally adding the unsigned 64-bit integer n to the + * unsigned 128-bit integer (represented using a 
two-element array of + * 64-bit words): + */ +#define ADDINC128(w, n) \ + { \ + (w)[0] += (sha2_word64)(n); \ + if ((w)[0] < (n)) { \ + (w)[1]++; \ + } \ + } + +/* + * Macros for copying blocks of memory and for zeroing out ranges + * of memory. Using these macros makes it easy to switch from + * using memset()/memcpy() and using bzero()/bcopy(). + * + * Please define either SHA2_USE_MEMSET_MEMCPY or define + * SHA2_USE_BZERO_BCOPY depending on which function set you + * choose to use: + */ +#if !defined(SHA2_USE_MEMSET_MEMCPY) && !defined(SHA2_USE_BZERO_BCOPY) +/* Default to memset()/memcpy() if no option is specified */ +#define SHA2_USE_MEMSET_MEMCPY 1 +#endif +#if defined(SHA2_USE_MEMSET_MEMCPY) && defined(SHA2_USE_BZERO_BCOPY) +/* Abort with an error if BOTH options are defined */ +#error Define either SHA2_USE_MEMSET_MEMCPY or SHA2_USE_BZERO_BCOPY, not both! +#endif + +#ifdef SHA2_USE_MEMSET_MEMCPY +#define MEMSET_BZERO(p, l) memset((p), 0, (l)) +#define MEMCPY_BCOPY(d, s, l) memcpy((d), (s), (l)) +#endif +#ifdef SHA2_USE_BZERO_BCOPY +#define MEMSET_BZERO(p, l) bzero((p), (l)) +#define MEMCPY_BCOPY(d, s, l) bcopy((s), (d), (l)) +#endif + +/*** THE SIX LOGICAL FUNCTIONS ****************************************/ +/* + * Bit shifting and rotation (used by the six SHA-XYZ logical functions: + * + * NOTE: The naming of R and S appears backwards here (R is a SHIFT and + * S is a ROTATION) because the SHA-256/384/512 description document + * (see http://csrc.nist.gov/cryptval/shs/sha256-384-512.pdf) uses this + * same "backwards" definition. 
+ */ +/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ +#define R(b, x) ((x) >> (b)) +/* 32-bit Rotate-right (used in SHA-256): */ +#define S32(b, x) (((x) >> (b)) | ((x) << (32 - (b)))) +/* 64-bit Rotate-right (used in SHA-384 and SHA-512): */ +#define S64(b, x) (((x) >> (b)) | ((x) << (64 - (b)))) + +/* Two of six logical functions used in SHA-256, SHA-384, and SHA-512: */ +#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +/* Four of six logical functions used in SHA-256: */ +#define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) +#define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) +#define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3, (x))) +#define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) + +/* Four of six logical functions used in SHA-384 and SHA-512: */ +#define Sigma0_512(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x))) +#define Sigma1_512(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x))) +#define sigma0_512(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7, (x))) +#define sigma1_512(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x))) + +/*** INTERNAL FUNCTION PROTOTYPES *************************************/ +/* NOTE: These should not be accessed directly from outside this + * library -- they are intended for private internal visibility/use + * only. 
+ */ +// void SHA512_Last(SHA512_CTX*); +// void SHA256_Transform(SHA256_CTX*, const sha2_word32*); +// void SHA512_Transform(SHA512_CTX*, const sha2_word64*); + +/*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/ +/* Hash constant words K for SHA-256: */ +const static sha2_word32 K256[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, + 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, + 0x0fc19dc6UL, 0x240ca1ccUL, 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, + 0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, 0xa2bfe8a1UL, 0xa81a664bUL, + 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, + 0x5b9cca4fUL, 0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL}; + +/* Initial hash value H for SHA-256: */ +const static sha2_word32 sha256_initial_hash_value[8] = {0x6a09e667UL, 0xbb67ae85UL, 0x3c6ef372UL, + 0xa54ff53aUL, 0x510e527fUL, 0x9b05688cUL, + 0x1f83d9abUL, 0x5be0cd19UL}; + +/* Hash constant words K for SHA-384 and SHA-512: */ +const static sha2_word64 K512[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 
0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL}; + +/* Initial hash value H for SHA-384 */ +const static sha2_word64 sha384_initial_hash_value[8] = { + 0xcbbb9d5dc1059ed8ULL, 0x629a292a367cd507ULL, 0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL, + 0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL, 0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL}; + +/* Initial hash value H for SHA-512 */ +const static sha2_word64 sha512_initial_hash_value[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 
0x5be0cd19137e2179ULL};
+
+/*** SHA-256: *********************************************************
+ * sha256_init() heap-allocates a SHA256_CTX, loads
+ * sha256_initial_hash_value into its state, zeroes the block buffer and
+ * resets the bit counter. It returns NULL when malloc() fails. The
+ * context is released by sha256_final().
+ */
+void* sha256_init() {
+    SHA256_CTX* context;
+    context = malloc(sizeof(SHA256_CTX));
+    if (!context)
+        return NULL;
+    MEMCPY_BCOPY(context->state, sha256_initial_hash_value, SHA256_DIGEST_LENGTH);
+    MEMSET_BZERO(context->buffer, SHA256_BLOCK_LENGTH);
+    context->bitcount = 0;
+    return context;
+}
+
+#ifdef SHA2_UNROLL_TRANSFORM
+
+/* Unrolled SHA-256 round macros (they expect data, W256, T1, s0, s1 and j in scope): */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+
+#define ROUND256_0_TO_15(a, b, c, d, e, f, g, h) \
+    REVERSE32(*data++, W256[j]); \
+    T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + W256[j]; \
+    (d) += T1; \
+    (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \
+    j++
+
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND256_0_TO_15(a, b, c, d, e, f, g, h) \
+    T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + (W256[j] = *data++); \
+    (d) += T1; \
+    (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \
+    j++
+
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND256(a, b, c, d, e, f, g, h) \
+    s0 = W256[(j + 1) & 0x0f]; \
+    s0 = sigma0_256(s0); \
+    s1 = W256[(j + 14) & 0x0f]; \
+    s1 = sigma1_256(s1); \
+    T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + \
+         (W256[j & 0x0f] += s1 + W256[(j + 9) & 0x0f] + s0); \
+    (d) += T1; \
+    (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \
+    j++
+/*
+ * Folds one 16-word block read through `data` into context->state
+ * (unrolled variant). context->buffer doubles as the 16-word message
+ * schedule W256, so whatever it held is overwritten.
+ */
+static void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) {
+    sha2_word32 a, b, c, d, e, f, g, h, s0, s1;
+    sha2_word32 T1, *W256;
+    int j;
+
+    W256 = (sha2_word32*)context->buffer;
+
+    /* Initialize registers with the prev. intermediate value */
+    a = context->state[0];
+    b = context->state[1];
+    c = context->state[2];
+    d = context->state[3];
+    e = context->state[4];
+    f = context->state[5];
+    g = context->state[6];
+    h = context->state[7];
+
+    j = 0;
+    do {
+        /* Rounds 0 to 15 (unrolled): each macro call advances j, and the
+         * argument rotation stands in for shuffling the registers */
+        ROUND256_0_TO_15(a, b, c, d, e, f, g, h);
+        ROUND256_0_TO_15(h, a, b, c, d, e, f, g);
+        ROUND256_0_TO_15(g, h, a, b, c, d, e, f);
+        ROUND256_0_TO_15(f, g, h, a, b, c, d, e);
+        ROUND256_0_TO_15(e, f, g, h, a, b, c, d);
+        ROUND256_0_TO_15(d, e, f, g, h, a, b, c);
+        ROUND256_0_TO_15(c, d, e, f, g, h, a, b);
+        ROUND256_0_TO_15(b, c, d, e, f, g, h, a);
+    } while (j < 16);
+
+    /* Now for the remaining rounds to 64: */
+    do {
+        ROUND256(a, b, c, d, e, f, g, h);
+        ROUND256(h, a, b, c, d, e, f, g);
+        ROUND256(g, h, a, b, c, d, e, f);
+        ROUND256(f, g, h, a, b, c, d, e);
+        ROUND256(e, f, g, h, a, b, c, d);
+        ROUND256(d, e, f, g, h, a, b, c);
+        ROUND256(c, d, e, f, g, h, a, b);
+        ROUND256(b, c, d, e, f, g, h, a);
+    } while (j < 64);
+
+    /* Compute the current intermediate hash value */
+    context->state[0] += a;
+    context->state[1] += b;
+    context->state[2] += c;
+    context->state[3] += d;
+    context->state[4] += e;
+    context->state[5] += f;
+    context->state[6] += g;
+    context->state[7] += h;
+
+    /* Clean up (clears the local working variables only) */
+    a = b = c = d = e = f = g = h = T1 = 0;
+}
+
+#else /* SHA2_UNROLL_TRANSFORM */
+/*
+ * Folds one 16-word block read through `data` into context->state
+ * (loop variant). context->buffer doubles as the 16-word message
+ * schedule W256, so whatever it held is overwritten.
+ */
+static void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) {
+    sha2_word32 a, b, c, d, e, f, g, h, s0, s1;
+    sha2_word32 T1, T2, *W256;
+    int j;
+
+    W256 = (sha2_word32*)context->buffer;
+
+    /* Initialize registers with the prev. intermediate value */
+    a = context->state[0];
+    b = context->state[1];
+    c = context->state[2];
+    d = context->state[3];
+    e = context->state[4];
+    f = context->state[5];
+    g = context->state[6];
+    h = context->state[7];
+
+    j = 0;
+    do {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        /* Copy data while converting to host byte order */
+        REVERSE32(*data++, W256[j]);
+        /* Apply the SHA-256 compression function to update a..h */
+        T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + W256[j];
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+        /* Apply the SHA-256 compression function to update a..h with copy */
+        T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + (W256[j] = *data++);
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+        T2 = Sigma0_256(a) + Maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = d + T1;
+        d = c;
+        c = b;
+        b = a;
+        a = T1 + T2;
+
+        j++;
+    } while (j < 16);
+
+    do {
+        /* Part of the message block expansion (W256 is used as a 16-entry ring): */
+        s0 = W256[(j + 1) & 0x0f];
+        s0 = sigma0_256(s0);
+        s1 = W256[(j + 14) & 0x0f];
+        s1 = sigma1_256(s1);
+
+        /* Apply the SHA-256 compression function to update a..h */
+        T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] +
+             (W256[j & 0x0f] += s1 + W256[(j + 9) & 0x0f] + s0);
+        T2 = Sigma0_256(a) + Maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = d + T1;
+        d = c;
+        c = b;
+        b = a;
+        a = T1 + T2;
+
+        j++;
+    } while (j < 64);
+
+    /* Compute the current intermediate hash value */
+    context->state[0] += a;
+    context->state[1] += b;
+    context->state[2] += c;
+    context->state[3] += d;
+    context->state[4] += e;
+    context->state[5] += f;
+    context->state[6] += g;
+    context->state[7] += h;
+
+    /* Clean up (clears the local working variables only) */
+    a = b = c = d = e = f = g = h = T1 = T2 = 0;
+}
+
+#endif /* SHA2_UNROLL_TRANSFORM */
+/*
+ * Absorbs `len` bytes of `data` into the hash. Bytes are staged in
+ * context->buffer until a full SHA256_BLOCK_LENGTH block is available;
+ * further complete blocks are hashed straight from `data`, and any tail
+ * is kept in the buffer for the next call. bitcount grows by 8 per byte.
+ * A zero `len` returns before the assertion, so that is the only case in
+ * which NULL pointers are tolerated.
+ */
+void sha256_update(SHA256_CTX* context, const sha2_byte* data, size_t len) {
+    unsigned int freespace, usedspace;
+
+    if (len == 0) {
+        /* Calling with no data is valid - we do nothing */
+        return;
+    }
+
+    /* Sanity check: */
+    assert(context != (SHA256_CTX*)0 && data != (sha2_byte*)0);
+
+    usedspace = (context->bitcount >> 3) % SHA256_BLOCK_LENGTH;
+    if (usedspace > 0) {
+        /* Calculate how much free space is available in the buffer */
+        freespace = SHA256_BLOCK_LENGTH - usedspace;
+
+        if (len >= freespace) {
+            /* Fill the buffer completely and process it */
+            MEMCPY_BCOPY(&context->buffer[usedspace], data, freespace);
+            context->bitcount += freespace << 3;
+            len -= freespace;
+            data += freespace;
+            SHA256_Transform(context, (sha2_word32*)context->buffer);
+        } else {
+            /* The buffer is not yet full */
+            MEMCPY_BCOPY(&context->buffer[usedspace], data, len);
+            context->bitcount += len << 3;
+            /* Clean up: */
+            usedspace = freespace = 0;
+            return;
+        }
+    }
+    while (len >= SHA256_BLOCK_LENGTH) {
+        /* Process as many complete blocks as we can */
+        /* NOTE(review): data is read through a word pointer here; presumably callers pass suitably aligned input - confirm */
+        SHA256_Transform(context, (sha2_word32*)data);
+        context->bitcount += SHA256_BLOCK_LENGTH << 3;
+        len -= SHA256_BLOCK_LENGTH;
+        data += SHA256_BLOCK_LENGTH;
+    }
+    if (len > 0) {
+        /* There's left-overs, so save 'em */
+        MEMCPY_BCOPY(context->buffer, data, len);
+        context->bitcount += len << 3;
+    }
+    /* Clean up: */
+    usedspace = freespace = 0;
+}
+/*
+ * Finishes the hash: pads the buffered data, appends the bit count, runs
+ * the last transform(s) and writes SHA256_DIGEST_LENGTH bytes to
+ * `digest`. When `digest` is NULL nothing is computed. In both cases the
+ * context is freed, so it must not be used after this call. Always
+ * returns SHA256_DIGEST_LENGTH.
+ */
+int sha256_final(SHA256_CTX* context, sha2_byte digest[SHA256_DIGEST_LENGTH]) {
+    sha2_word32* d = (sha2_word32*)digest; /* digest is written one 32-bit word at a time */
+    unsigned int usedspace;
+
+    /* Sanity check: */
+    assert(context != (SHA256_CTX*)0);
+
+    /* If no digest buffer is passed, we don't bother doing this: */
+    if (digest != (sha2_byte*)0) {
+        usedspace = (context->bitcount >> 3) % SHA256_BLOCK_LENGTH;
+#if BYTE_ORDER == LITTLE_ENDIAN
+        /* Convert FROM host byte order (bitcount is byte-swapped in place) */
+        REVERSE64(context->bitcount, context->bitcount);
+#endif
+        if (usedspace > 0) {
+            /* Begin padding with a 1 bit: */
+            context->buffer[usedspace++] = 0x80;
+
+            if (usedspace <= SHA256_SHORT_BLOCK_LENGTH) {
+                /* Set-up for the last transform: */
+                MEMSET_BZERO(&context->buffer[usedspace], SHA256_SHORT_BLOCK_LENGTH - usedspace);
+            } else {
+                /* No room left for the length field: pad out this block and hash it first */
+                if (usedspace < SHA256_BLOCK_LENGTH) {
+                    MEMSET_BZERO(&context->buffer[usedspace], SHA256_BLOCK_LENGTH - usedspace);
+                }
+                /* Do second-to-last transform: */
+                SHA256_Transform(context, (sha2_word32*)context->buffer);
+
+                /* And set-up for the last transform: */
+                MEMSET_BZERO(context->buffer, SHA256_SHORT_BLOCK_LENGTH);
+            }
+        } else {
+            /* Set-up for the last transform: */
+            MEMSET_BZERO(context->buffer, SHA256_SHORT_BLOCK_LENGTH);
+
+            /* Begin padding with a 1 bit: */
+            *context->buffer = 0x80;
+        }
+        /* Set the bit count: */
+        *(sha2_word64*)&context->buffer[SHA256_SHORT_BLOCK_LENGTH] = context->bitcount;
+
+        /* Final transform: */
+        SHA256_Transform(context, (sha2_word32*)context->buffer);
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+        {
+            /* Byte-swap each state word in place and emit it */
+            int j;
+            for (j = 0; j < 8; j++) {
+                REVERSE32(context->state[j], context->state[j]);
+                *d++ = context->state[j];
+            }
+        }
+#else
+        MEMCPY_BCOPY(d, context->state, SHA256_DIGEST_LENGTH);
+#endif
+    }
+
+    /* Release the context (it is freed as-is, not wiped first): */
+    free(context);
+    usedspace = 0;
+    return SHA256_DIGEST_LENGTH;
+}
+
+/*** SHA-512: *********************************************************
+ * sha512_init() heap-allocates a SHA512_CTX, loads
+ * sha512_initial_hash_value into its state, zeroes the block buffer and
+ * both halves of the bit counter. It returns NULL when malloc() fails.
+ * The context is released by sha512_final().
+ */
+void* sha512_init() {
+    SHA512_CTX* context;
+    context = malloc(sizeof(SHA512_CTX));
+    if (!context)
+        return NULL;
+    MEMCPY_BCOPY(context->state, sha512_initial_hash_value, SHA512_DIGEST_LENGTH);
+    MEMSET_BZERO(context->buffer, SHA512_BLOCK_LENGTH);
+    context->bitcount[0] = context->bitcount[1] = 0;
+    return context;
+}
+
+#ifdef SHA2_UNROLL_TRANSFORM
+
+/* Unrolled SHA-512 round macros (they expect data, W512, T1, s0, s1 and j in scope): */
+#if BYTE_ORDER == LITTLE_ENDIAN
+
+#define ROUND512_0_TO_15(a, b, c, d, e, f, g, h) \
+    REVERSE64(*data++, W512[j]); \
+    T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + W512[j]; \
+    (d) += T1, (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)), j++
+
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+
+#define ROUND512_0_TO_15(a, b, c, d, e, f, g, h) \
+    T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + (W512[j] = *data++); \
+    (d) += T1; \
+    (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)); \
+    j++
+
+#endif /* BYTE_ORDER == 
LITTLE_ENDIAN */
+
+#define ROUND512(a, b, c, d, e, f, g, h) \
+    s0 = W512[(j + 1) & 0x0f]; \
+    s0 = sigma0_512(s0); \
+    s1 = W512[(j + 14) & 0x0f]; \
+    s1 = sigma1_512(s1); \
+    T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + \
+         (W512[j & 0x0f] += s1 + W512[(j + 9) & 0x0f] + s0); \
+    (d) += T1; \
+    (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)); \
+    j++
+/*
+ * Folds one 16-word block read through `data` into context->state
+ * (unrolled variant). context->buffer doubles as the 16-word message
+ * schedule W512, so whatever it held is overwritten.
+ */
+static void SHA512_Transform(SHA512_CTX* context, const sha2_word64* data) {
+    sha2_word64 a, b, c, d, e, f, g, h, s0, s1;
+    sha2_word64 T1, *W512 = (sha2_word64*)context->buffer;
+    int j;
+
+    /* Initialize registers with the prev. intermediate value */
+    a = context->state[0];
+    b = context->state[1];
+    c = context->state[2];
+    d = context->state[3];
+    e = context->state[4];
+    f = context->state[5];
+    g = context->state[6];
+    h = context->state[7];
+
+    j = 0;
+    do {
+        ROUND512_0_TO_15(a, b, c, d, e, f, g, h);
+        ROUND512_0_TO_15(h, a, b, c, d, e, f, g);
+        ROUND512_0_TO_15(g, h, a, b, c, d, e, f);
+        ROUND512_0_TO_15(f, g, h, a, b, c, d, e);
+        ROUND512_0_TO_15(e, f, g, h, a, b, c, d);
+        ROUND512_0_TO_15(d, e, f, g, h, a, b, c);
+        ROUND512_0_TO_15(c, d, e, f, g, h, a, b);
+        ROUND512_0_TO_15(b, c, d, e, f, g, h, a);
+    } while (j < 16);
+
+    /* Now for the remaining rounds up to 79: */
+    do {
+        ROUND512(a, b, c, d, e, f, g, h);
+        ROUND512(h, a, b, c, d, e, f, g);
+        ROUND512(g, h, a, b, c, d, e, f);
+        ROUND512(f, g, h, a, b, c, d, e);
+        ROUND512(e, f, g, h, a, b, c, d);
+        ROUND512(d, e, f, g, h, a, b, c);
+        ROUND512(c, d, e, f, g, h, a, b);
+        ROUND512(b, c, d, e, f, g, h, a);
+    } while (j < 80);
+
+    /* Compute the current intermediate hash value */
+    context->state[0] += a;
+    context->state[1] += b;
+    context->state[2] += c;
+    context->state[3] += d;
+    context->state[4] += e;
+    context->state[5] += f;
+    context->state[6] += g;
+    context->state[7] += h;
+
+    /* Clean up (clears the local working variables only) */
+    a = b = c = d = e = f = g = h = T1 = 0;
+}
+
+#else /* SHA2_UNROLL_TRANSFORM */
+/*
+ * Folds one 16-word block read through `data` into context->state
+ * (loop variant). context->buffer doubles as the 16-word message
+ * schedule W512, so whatever it held is overwritten.
+ */
+static void SHA512_Transform(SHA512_CTX* context, const sha2_word64* data) {
+    sha2_word64 a, b, c, d, e, f, g, h, s0, s1;
+    sha2_word64 T1, T2, *W512 = (sha2_word64*)context->buffer;
+    int j;
+
+    /* Initialize registers with the prev. intermediate value */
+    a = context->state[0];
+    b = context->state[1];
+    c = context->state[2];
+    d = context->state[3];
+    e = context->state[4];
+    f = context->state[5];
+    g = context->state[6];
+    h = context->state[7];
+
+    j = 0;
+    do {
+#if BYTE_ORDER == LITTLE_ENDIAN
+        /* Convert TO host byte order */
+        REVERSE64(*data++, W512[j]);
+        /* Apply the SHA-512 compression function to update a..h */
+        T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + W512[j];
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+        /* Apply the SHA-512 compression function to update a..h with copy */
+        T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + (W512[j] = *data++);
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+        T2 = Sigma0_512(a) + Maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = d + T1;
+        d = c;
+        c = b;
+        b = a;
+        a = T1 + T2;
+
+        j++;
+    } while (j < 16);
+
+    do {
+        /* Part of the message block expansion (W512 is used as a 16-entry ring): */
+        s0 = W512[(j + 1) & 0x0f];
+        s0 = sigma0_512(s0);
+        s1 = W512[(j + 14) & 0x0f];
+        s1 = sigma1_512(s1);
+
+        /* Apply the SHA-512 compression function to update a..h */
+        T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] +
+             (W512[j & 0x0f] += s1 + W512[(j + 9) & 0x0f] + s0);
+        T2 = Sigma0_512(a) + Maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = d + T1;
+        d = c;
+        c = b;
+        b = a;
+        a = T1 + T2;
+
+        j++;
+    } while (j < 80);
+
+    /* Compute the current intermediate hash value */
+    context->state[0] += a;
+    context->state[1] += b;
+    context->state[2] += c;
+    context->state[3] += d;
+    context->state[4] += e;
+    context->state[5] += f;
+    context->state[6] += g;
+    context->state[7] += h;
+
+    /* Clean up (clears the local working variables only) */
+    a = b = c = d = e = f = g = h = T1 = T2 = 0;
+}
+
+#endif /* SHA2_UNROLL_TRANSFORM */
+/*
+ * Absorbs `len` bytes of `data` into the hash. Bytes are staged in
+ * context->buffer until a full SHA512_BLOCK_LENGTH block is available;
+ * further complete blocks are hashed straight from `data`, and any tail
+ * is kept in the buffer for the next call. The two-word bit counter is
+ * advanced through ADDINC128. A zero `len` returns before the assertion,
+ * so that is the only case in which NULL pointers are tolerated.
+ */
+void sha512_update(SHA512_CTX* context, const sha2_byte* data, size_t len) {
+    unsigned int freespace, usedspace;
+
+    if (len == 0) {
+        /* Calling with no data is valid - we do nothing */
+        return;
+    }
+
+    /* Sanity check: */
+    assert(context != (SHA512_CTX*)0 && data != (sha2_byte*)0);
+
+    usedspace = (context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH;
+    if (usedspace > 0) {
+        /* Calculate how much free space is available in the buffer */
+        freespace = SHA512_BLOCK_LENGTH - usedspace;
+
+        if (len >= freespace) {
+            /* Fill the buffer completely and process it */
+            MEMCPY_BCOPY(&context->buffer[usedspace], data, freespace);
+            ADDINC128(context->bitcount, freespace << 3);
+            len -= freespace;
+            data += freespace;
+            SHA512_Transform(context, (sha2_word64*)context->buffer);
+        } else {
+            /* The buffer is not yet full */
+            MEMCPY_BCOPY(&context->buffer[usedspace], data, len);
+            ADDINC128(context->bitcount, len << 3);
+            /* Clean up: */
+            usedspace = freespace = 0;
+            return;
+        }
+    }
+    while (len >= SHA512_BLOCK_LENGTH) {
+        /* Process as many complete blocks as we can */
+        /* NOTE(review): data is read through a word pointer here; presumably callers pass suitably aligned input - confirm */
+        SHA512_Transform(context, (sha2_word64*)data);
+        ADDINC128(context->bitcount, SHA512_BLOCK_LENGTH << 3);
+        len -= SHA512_BLOCK_LENGTH;
+        data += SHA512_BLOCK_LENGTH;
+    }
+    if (len > 0) {
+        /* There's left-overs, so save 'em */
+        MEMCPY_BCOPY(context->buffer, data, len);
+        ADDINC128(context->bitcount, len << 3);
+    }
+    /* Clean up: */
+    usedspace = freespace = 0;
+}
+/*
+ * Shared tail of sha512_final() and sha384_final(): pads the buffered
+ * data, stores the two-word bit count at the end of the block and runs
+ * the last transform(s). The result is left in context->state.
+ */
+static void SHA512_Last(SHA512_CTX* context) {
+    unsigned int usedspace;
+
+    usedspace = (context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH;
+#if BYTE_ORDER == LITTLE_ENDIAN
+    /* Convert FROM host byte order (both counter words are byte-swapped in place) */
+    REVERSE64(context->bitcount[0], context->bitcount[0]);
+    REVERSE64(context->bitcount[1], context->bitcount[1]);
+#endif
+    if (usedspace > 0) {
+        /* Begin padding with a 1 bit: */
+        context->buffer[usedspace++] = 0x80;
+
+        if (usedspace <= SHA512_SHORT_BLOCK_LENGTH) {
+            /* Set-up for the last transform: */
+            MEMSET_BZERO(&context->buffer[usedspace], SHA512_SHORT_BLOCK_LENGTH - usedspace);
+        } else {
+            /* No room left for the length field: pad out this block and hash it first */
+            if (usedspace < SHA512_BLOCK_LENGTH) {
+                MEMSET_BZERO(&context->buffer[usedspace], SHA512_BLOCK_LENGTH - usedspace);
+            }
+            /* Do second-to-last transform: */
+            SHA512_Transform(context, (sha2_word64*)context->buffer);
+
+            /* And set-up for the last transform: */
+            /* NOTE(review): clears BLOCK_LENGTH - 2 bytes, unlike the SHORT_BLOCK_LENGTH used in the other branches; presumably harmless because the length words stored below overwrite the block's tail - confirm */
+            MEMSET_BZERO(context->buffer, SHA512_BLOCK_LENGTH - 2);
+        }
+    } else {
+        /* Prepare for final transform: */
+        MEMSET_BZERO(context->buffer, SHA512_SHORT_BLOCK_LENGTH);
+
+        /* Begin padding with a 1 bit: */
+        *context->buffer = 0x80;
+    }
+    /* Store the length of input data (in bits), bitcount[1] first: */
+    *(sha2_word64*)&context->buffer[SHA512_SHORT_BLOCK_LENGTH] = context->bitcount[1];
+    *(sha2_word64*)&context->buffer[SHA512_SHORT_BLOCK_LENGTH + 8] = context->bitcount[0];
+
+    /* Final transform: */
+    SHA512_Transform(context, (sha2_word64*)context->buffer);
+}
+/*
+ * Finishes a SHA-512 hash and writes SHA512_DIGEST_LENGTH bytes to
+ * `digest`. When `digest` is NULL nothing is computed. In both cases the
+ * context is freed, so it must not be used after this call. Always
+ * returns SHA512_DIGEST_LENGTH.
+ */
+int sha512_final(SHA512_CTX* context, sha2_byte digest[SHA512_DIGEST_LENGTH]) {
+    sha2_word64* d = (sha2_word64*)digest; /* digest is written one 64-bit word at a time */
+
+    /* Sanity check: */
+    assert(context != (SHA512_CTX*)0);
+
+    /* If no digest buffer is passed, we don't bother doing this: */
+    if (digest != (sha2_byte*)0) {
+        SHA512_Last(context);
+
+        /* Save the hash data for output: */
+#if BYTE_ORDER == LITTLE_ENDIAN
+        {
+            /* Byte-swap each state word in place and emit it */
+            int j;
+            for (j = 0; j < 8; j++) {
+                REVERSE64(context->state[j], context->state[j]);
+                *d++ = context->state[j];
+            }
+        }
+#else
+        MEMCPY_BCOPY(d, context->state, SHA512_DIGEST_LENGTH);
+#endif
+    }
+
+    /* Release the context (it is freed as-is, not zeroed) */
+    free(context);
+    return SHA512_DIGEST_LENGTH;
+}
+
+/*** SHA-384: *********************************************************
+ * sha384_init() heap-allocates a SHA384_CTX, loads
+ * sha384_initial_hash_value into its state, zeroes the block buffer and
+ * both halves of the bit counter. It returns NULL when malloc() fails.
+ * The context is released by sha384_final().
+ */
+void* sha384_init() {
+    SHA384_CTX* context;
+    context = malloc(sizeof(SHA384_CTX));
+    if (!context)
+        return NULL;
+    /* The whole 8-word state is initialised, hence SHA512_DIGEST_LENGTH */
+    MEMCPY_BCOPY(context->state, sha384_initial_hash_value, SHA512_DIGEST_LENGTH);
+    MEMSET_BZERO(context->buffer, SHA384_BLOCK_LENGTH);
+    context->bitcount[0] = context->bitcount[1] = 0;
+    return context;
+}
+/* Absorbs input by delegating to sha512_update() on the same context. */
+void sha384_update(SHA384_CTX* context, const sha2_byte* data, size_t len) {
+    sha512_update((SHA512_CTX*)context, data, len);
+}
+/*
+ * Finishes a SHA-384 hash: runs SHA512_Last() and writes only the first
+ * six state words (SHA384_DIGEST_LENGTH bytes) to `digest`. When
+ * `digest` is NULL nothing is computed. In both cases the context is
+ * freed. Always returns SHA384_DIGEST_LENGTH.
+ */
+int sha384_final(SHA384_CTX* context, sha2_byte digest[SHA384_DIGEST_LENGTH]) {
+    sha2_word64* d = (sha2_word64*)digest; /* digest is written one 64-bit word at a time */
+
+    /* Sanity check: */
+    assert(context != (SHA384_CTX*)0);
+
+    /* If no digest buffer is passed, we don't bother doing this: */
+    if (digest != (sha2_byte*)0) {
+        SHA512_Last((SHA512_CTX*)context);
+
+        /* Save the hash data for output: */
+#if BYTE_ORDER == LITTLE_ENDIAN
+        {
+            /* Byte-swap each of the six emitted state words in place */
+            int j;
+            for (j = 0; j < 6; j++) {
+                REVERSE64(context->state[j], context->state[j]);
+                *d++ = context->state[j];
+            }
+        }
+#else
+        MEMCPY_BCOPY(d, context->state, SHA384_DIGEST_LENGTH);
+#endif
+    }
+
+    /* Release the context (it is freed as-is, not zeroed) */
+    free(context);
+    return SHA384_DIGEST_LENGTH;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/sha2.h b/libsql-ffi/bundled/sqlean/crypto/sha2.h
new file mode 100644
index 0000000000..853d9b1a72
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/sha2.h
@@ -0,0 +1,96 @@
+/*
+ * FILE: sha2.h
+ * AUTHOR: Aaron D. Gifford - http://www.aarongifford.com/
+ *
+ * Copyright (c) 2000-2001, Aaron D. Gifford
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTOR(S) ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTOR(S) BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: sha2.h,v 1.1 2001/11/08 00:02:01 adg Exp adg $ + */ + +#ifndef __SHA2_H__ +#define __SHA2_H__ + +#define SHA2_USE_INTTYPES_H +#define SHA2_UNROLL_TRANSFORM +#define NOPROTO + +/* + * Import u_intXX_t size_t type definitions from system headers. You + * may need to change this, or define these things yourself in this + * file. 
+ */
+#include <sys/types.h>
+
+#ifdef SHA2_USE_INTTYPES_H
+
+#include <inttypes.h>
+
+#endif /* SHA2_USE_INTTYPES_H */
+
+/*** SHA-256/384/512 Various Length Definitions ***********************/
+/* All lengths are in bytes; the *_STRING_LENGTH values leave room for a
+ * hex rendering of the digest plus a terminating NUL. */
+#define SHA256_BLOCK_LENGTH 64
+#define SHA256_DIGEST_LENGTH 32
+#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1)
+#define SHA384_BLOCK_LENGTH 128
+#define SHA384_DIGEST_LENGTH 48
+#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1)
+#define SHA512_BLOCK_LENGTH 128
+#define SHA512_DIGEST_LENGTH 64
+#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1)
+
+/*** SHA-256/384/512 Context Structures *******************************/
+
+/* Running SHA-256 state: eight 32-bit chaining words, a 64-bit bit
+ * counter and one block of buffered input. */
+typedef struct _SHA256_CTX {
+    uint32_t state[8];
+    uint64_t bitcount;
+    uint8_t buffer[SHA256_BLOCK_LENGTH];
+} SHA256_CTX;
+
+/* Running SHA-512 state: eight 64-bit chaining words, a two-word bit
+ * counter and one block of buffered input. */
+typedef struct _SHA512_CTX {
+    uint64_t state[8];
+    uint64_t bitcount[2];
+    uint8_t buffer[SHA512_BLOCK_LENGTH];
+} SHA512_CTX;
+
+/* SHA-384 shares the SHA-512 context layout. */
+typedef SHA512_CTX SHA384_CTX;
+
+/*** SHA-256/384/512 Function Prototypes ******************************/
+
+/* Each *_init() returns a context pointer (as void*); pass it to the
+ * matching *_update() and *_final() functions. */
+void* sha256_init();
+void sha256_update(SHA256_CTX*, const uint8_t*, size_t);
+int sha256_final(SHA256_CTX*, uint8_t[SHA256_DIGEST_LENGTH]);
+
+void* sha384_init();
+void sha384_update(SHA384_CTX*, const uint8_t*, size_t);
+int sha384_final(SHA384_CTX*, uint8_t[SHA384_DIGEST_LENGTH]);
+
+void* sha512_init();
+void sha512_update(SHA512_CTX*, const uint8_t*, size_t);
+int sha512_final(SHA512_CTX*, uint8_t[SHA512_DIGEST_LENGTH]);
+
+#endif /* __SHA2_H__ */
diff --git a/libsql-ffi/bundled/sqlean/crypto/url.c b/libsql-ffi/bundled/sqlean/crypto/url.c
new file mode 100644
index 0000000000..2613b61ed3
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/url.c
@@ -0,0 +1,82 @@
+// Originally by Fränz Friederes, MIT License
+// https://github.com/cryptii/cryptii/blob/main/src/Encoder/URL.js
+
+// Modified by Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean/
+
+// URL-escape encoding/decoding
+
+#include <ctype.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+const char* url_chars
= "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~";
+
+/*
+ * Returns non-zero when byte `c` may appear unescaped in the output.
+ * The NUL byte is rejected explicitly: strchr() would otherwise match
+ * the terminator of url_chars and report NUL as an allowed character.
+ */
+static int url_is_unreserved(uint8_t c) {
+    return c != '\0' && strchr(url_chars, c) != NULL;
+}
+
+/*
+ * Converts one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
+ * numeric value. The caller must pass a valid hex digit.
+ */
+uint8_t hex_to_ascii(char c) {
+    /* <ctype.h> functions take an unsigned char value, never a negative char. */
+    const unsigned char uc = (unsigned char)c;
+    if (isdigit(uc)) {
+        return (uint8_t)(uc - '0');
+    }
+    return (uint8_t)(tolower(uc) - 'a' + 10);
+}
+
+/*
+ * Percent-encodes `len` bytes of `src`. Bytes listed in url_chars are
+ * copied as-is; every other byte (including NUL) becomes %XX with
+ * upper-case hex digits.
+ *
+ * Returns a malloc()-ed, NUL-terminated buffer that the caller frees and
+ * stores its length (without the terminator) in *out_len. On allocation
+ * failure returns NULL and sets *out_len to 0.
+ */
+uint8_t* url_encode(const uint8_t* src, size_t len, size_t* out_len) {
+    static const char hex_digits[] = "0123456789ABCDEF";
+
+    /* First pass: compute the exact size of the encoded output. */
+    size_t encoded_len = 0;
+    for (size_t i = 0; i < len; i++) {
+        encoded_len += url_is_unreserved(src[i]) ? 1 : 3;
+    }
+
+    uint8_t* encoded = malloc(encoded_len + 1);
+    if (encoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    /* Second pass: emit the bytes. */
+    size_t pos = 0;
+    for (size_t i = 0; i < len; i++) {
+        if (url_is_unreserved(src[i])) {
+            encoded[pos++] = src[i];
+        } else {
+            encoded[pos++] = '%';
+            encoded[pos++] = (uint8_t)hex_digits[src[i] >> 4];
+            encoded[pos++] = (uint8_t)hex_digits[src[i] & 0x0F];
+        }
+    }
+    encoded[pos] = '\0';
+
+    *out_len = pos;
+    return encoded;
+}
+
+/*
+ * Decodes `len` bytes of percent-encoded `src`. "%XX" becomes the byte
+ * 0xXX, '+' becomes a space, and every other byte is copied as-is.
+ *
+ * Returns a malloc()-ed, NUL-terminated buffer that the caller frees and
+ * stores the decoded length in *out_len. Returns NULL with *out_len set
+ * to 0 when allocation fails or when a '%' is not followed by two hex
+ * digits.
+ */
+uint8_t* url_decode(const uint8_t* src, size_t len, size_t* out_len) {
+    /* Decoding never grows the data. The extra byte holds the terminator
+     * and keeps the request non-zero, so empty input cannot be mistaken
+     * for an allocation failure. */
+    uint8_t* decoded = malloc(len + 1);
+    if (decoded == NULL) {
+        *out_len = 0;
+        return NULL;
+    }
+
+    size_t pos = 0;
+    for (size_t i = 0; i < len; i++) {
+        if (src[i] == '%') {
+            if (i + 2 >= len || !isxdigit(src[i + 1]) || !isxdigit(src[i + 2])) {
+                free(decoded);
+                *out_len = 0;
+                return NULL;
+            }
+            decoded[pos++] =
+                (uint8_t)((hex_to_ascii((char)src[i + 1]) << 4) | hex_to_ascii((char)src[i + 2]));
+            i += 2;
+        } else if (src[i] == '+') {
+            decoded[pos++] = ' ';
+        } else {
+            decoded[pos++] = src[i];
+        }
+    }
+    decoded[pos] = '\0';
+
+    *out_len = pos;
+    return decoded;
+}
diff --git a/libsql-ffi/bundled/sqlean/crypto/url.h b/libsql-ffi/bundled/sqlean/crypto/url.h
new file mode 100644
index 0000000000..5dc9955e2d
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/crypto/url.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// URL-escape encoding/decoding
+
+#ifndef _URL_H_
+#define _URL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint8_t* url_encode(const uint8_t* src, size_t len, size_t* out_len);
+uint8_t* url_decode(const uint8_t* src, size_t len,
size_t* out_len);
+
+#endif /* _URL_H_ */
diff --git a/libsql-ffi/bundled/sqlean/define/define.h b/libsql-ffi/bundled/sqlean/define/define.h
new file mode 100644
index 0000000000..af9a8a568b
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/define/define.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// User-defined functions in SQLite.
+
+#ifndef DEFINE_INTERNAL_H
+#define DEFINE_INTERNAL_H
+
+#include "sqlite3ext.h"
+
+int define_save_function(sqlite3* db, const char* name, const char* type, const char* body);
+
+int define_eval_init(sqlite3* db);
+int define_manage_init(sqlite3* db);
+int define_module_init(sqlite3* db);
+
+#endif /* DEFINE_INTERNAL_H */
diff --git a/libsql-ffi/bundled/sqlean/define/eval.c b/libsql-ffi/bundled/sqlean/define/eval.c
new file mode 100644
index 0000000000..0475ad0bd3
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/define/eval.c
@@ -0,0 +1,107 @@
+// Created by D. Richard Hipp, Public Domain
+// https://www.sqlite.org/src/file/ext/misc/eval.c
+
+// Modified by Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean/
+
+// Evaluate dynamic SQL.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT3
+
+/*
+ * Structure used to accumulate the output
+ */
+struct EvalResult {
+    char* z;              /* Accumulated output */
+    const char* zSep;     /* Separator */
+    int szSep;            /* Size of the separator string */
+    sqlite3_int64 nAlloc; /* Number of bytes allocated for z[] */
+    sqlite3_int64 nUsed;  /* Number of bytes of z[] actually used */
+};
+
+/*
+ * Callback from sqlite3_exec() for the eval() function.
+ *
+ * Appends every column value of the row to the accumulator, putting the
+ * separator before each value except the very first one. A NULL column
+ * is appended as an empty string. When the buffer cannot be grown, the
+ * accumulator is freed and zeroed (so zSep becomes NULL, which the
+ * caller uses to detect out-of-memory) and SQLITE_NOMEM is returned.
+ */
+static int eval_callback(void* pCtx, int argc, char** argv, char** colnames) {
+    struct EvalResult* p = (struct EvalResult*)pCtx;
+    int i;
+    (void)colnames; /* column names are not part of the output */
+    if (argv == 0) {
+        return SQLITE_OK;
+    }
+    for (i = 0; i < argc; i++) {
+        const char* z = argv[i] ? argv[i] : "";
+        size_t sz = strlen(z);
+        if ((sqlite3_int64)sz + p->nUsed + p->szSep + 1 > p->nAlloc) {
+            char* zNew;
+            p->nAlloc = p->nAlloc * 2 + sz + p->szSep + 1;
+            /* The accumulated text is later returned with an int length,
+            ** so refuse to grow the buffer past 0x7fffffff bytes. */
+            zNew = p->nAlloc <= 0x7fffffff ? sqlite3_realloc64(p->z, p->nAlloc) : 0;
+            if (zNew == 0) {
+                sqlite3_free(p->z);
+                memset(p, 0, sizeof(*p));
+                return SQLITE_NOMEM;
+            }
+            p->z = zNew;
+        }
+        if (p->nUsed > 0) {
+            memcpy(&p->z[p->nUsed], p->zSep, p->szSep);
+            p->nUsed += p->szSep;
+        }
+        memcpy(&p->z[p->nUsed], z, sz);
+        p->nUsed += sz;
+    }
+    return SQLITE_OK;
+}
+
+/*
+ * Implementation of the eval(X) and eval(X,Y) SQL functions.
+ *
+ * Evaluate the SQL text in X. Return the results, using string
+ * Y as the separator. If Y is omitted, use a single space character.
+ * A NULL X or Y leaves the function result unset.
+ */
+static void define_eval(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const char* zSql;
+    sqlite3* db;
+    char* zErr = 0;
+    int rc;
+    struct EvalResult x;
+
+    memset(&x, 0, sizeof(x));
+    x.zSep = " ";
+    zSql = (const char*)sqlite3_value_text(argv[0]);
+    if (zSql == 0) {
+        return;
+    }
+    if (argc > 1) {
+        x.zSep = (const char*)sqlite3_value_text(argv[1]);
+        if (x.zSep == 0) {
+            return;
+        }
+    }
+    x.szSep = (int)strlen(x.zSep);
+    db = sqlite3_context_db_handle(context);
+    rc = sqlite3_exec(db, zSql, eval_callback, &x, &zErr);
+    if (x.zSep == 0) {
+        /* eval_callback() ran out of memory and already released x.z. */
+        sqlite3_result_error_nomem(context);
+        sqlite3_free(zErr);
+    } else if (rc != SQLITE_OK) {
+        if (zErr != 0) {
+            sqlite3_result_error(context, zErr, -1);
+        } else {
+            sqlite3_result_error_code(context, rc);
+        }
+        sqlite3_free(zErr);
+        /* Rows gathered before the failure are discarded, not leaked. */
+        sqlite3_free(x.z);
+    } else {
+        /* Ownership of x.z passes to SQLite, which frees it with sqlite3_free(). */
+        sqlite3_result_text(context, x.z, (int)x.nUsed, sqlite3_free);
+    }
+}
+
+/*
+ * Registers eval(X) and eval(X,Y) on the connection. Both registrations
+ * are attempted; the first failure code, if any, is returned.
+ */
+int define_eval_init(sqlite3* db) {
+    const int flags = SQLITE_UTF8 | SQLITE_DIRECTONLY;
+    int rc1 = sqlite3_create_function(db, "eval", 1, flags, NULL, define_eval, NULL, NULL);
+    int rc2 = sqlite3_create_function(db, "eval", 2, flags, NULL, define_eval, NULL, NULL);
+    return rc1 != SQLITE_OK ? rc1 : rc2;
+}
diff --git
a/libsql-ffi/bundled/sqlean/define/extension.c b/libsql-ffi/bundled/sqlean/define/extension.c
new file mode 100644
index 0000000000..b5403be9e0
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/define/extension.c
@@ -0,0 +1,16 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// User-defined functions in SQLite.
+
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT3
+
+#include "define/define.h"
+
+/*
+ * Initializes the three parts of the "define" extension on `db`:
+ * function management, eval() and the module part. All three are always
+ * attempted, in that order; the first non-OK status is returned, or
+ * SQLITE_OK when every step succeeds.
+ */
+int define_init(sqlite3* db) {
+    int status = define_manage_init(db);
+    int rc = define_eval_init(db);
+    if (status == SQLITE_OK) {
+        status = rc;
+    }
+    rc = define_module_init(db);
+    if (status == SQLITE_OK) {
+        status = rc;
+    }
+    return status;
+}
diff --git a/libsql-ffi/bundled/sqlean/define/extension.h b/libsql-ffi/bundled/sqlean/define/extension.h
new file mode 100644
index 0000000000..8c036fea03
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/define/extension.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// User-defined functions in SQLite.
+
+#ifndef DEFINE_EXTENSION_H
+#define DEFINE_EXTENSION_H
+
+#include "sqlite3ext.h"
+
+int define_init(sqlite3* db);
+
+#endif /* DEFINE_EXTENSION_H */
diff --git a/libsql-ffi/bundled/sqlean/define/manage.c b/libsql-ffi/bundled/sqlean/define/manage.c
new file mode 100644
index 0000000000..6b5a5d4862
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/define/manage.c
@@ -0,0 +1,331 @@
+// Copyright (c) 2022 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Manage defined functions.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT3
+
+#define DEFINE_CACHE 2
+
+#pragma region statement cache
+
+/* Node of a singly linked list of prepared statements. */
+typedef struct cache_node {
+    sqlite3_stmt* stmt;
+    struct cache_node* next;
+} cache_node;
+
+static cache_node* cache_head = NULL;
+static cache_node* cache_tail = NULL;
+
+/*
+ * Appends a prepared statement to the end of the cache list.
+ * Returns SQLITE_OK, or SQLITE_ERROR when the node cannot be allocated
+ * (the list is left unchanged in that case).
+ */
+static int cache_add(sqlite3_stmt* stmt) {
+    cache_node* node = (cache_node*)malloc(sizeof(cache_node));
+    if (node == NULL) {
+        return SQLITE_ERROR;
+    }
+    node->stmt = stmt;
+    node->next = NULL;
+    if (cache_head == NULL) {
+        cache_head = node;
+    } else {
+        cache_tail->next = node;
+    }
+    cache_tail = node;
+    return SQLITE_OK;
+}
+
+/*
+ * Prints the SQL text of every cached statement to stdout, one per line.
+ */
+static void cache_print() {
+    if (cache_head == NULL) {
+        printf("cache is empty\n");
+        return;
+    }
+    for (cache_node* curr = cache_head; curr != NULL; curr = curr->next) {
+        printf("%s\n", sqlite3_sql(curr->stmt));
+    }
+}
+
+/*
+ * Finalizes every cached statement, frees the list nodes and resets the
+ * list to empty.
+ */
+static void cache_free() {
+    cache_node* curr = cache_head;
+    while (curr != NULL) {
+        cache_node* next = curr->next;
+        sqlite3_finalize(curr->stmt);
+        free(curr);
+        curr = next;
+    }
+    cache_head = cache_tail = NULL;
+}
+
+/*
+ * Prints prepared statements cache contents.
+ */
+static void define_cache(sqlite3_context* ctx, int argc, sqlite3_value** argv) {
+    cache_print();
+}
+
+#pragma endregion
+
+/*
+ * Saves user-defined function into the database.
+ *
+ * Inserts (name, type, body) into the sqlean_define table; an existing
+ * conflicting row is left untouched. Returns SQLITE_OK on success or the
+ * SQLite error code of the step that failed.
+ */
+int define_save_function(sqlite3* db, const char* name, const char* type, const char* body) {
+    const char* sql =
+        "insert into sqlean_define(name, type, body) values (?, ?, ?) "
+        "on conflict do nothing";
+    sqlite3_stmt* stmt;
+    int ret = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
+    if (ret != SQLITE_OK) {
+        return ret;
+    }
+    /* SQLITE_STATIC is safe: the strings outlive the statement, which is
+     * stepped and finalized before this function returns. */
+    ret = sqlite3_bind_text(stmt, 1, name, -1, SQLITE_STATIC);
+    if (ret == SQLITE_OK) {
+        ret = sqlite3_bind_text(stmt, 2, type, -1, SQLITE_STATIC);
+    }
+    if (ret == SQLITE_OK) {
+        ret = sqlite3_bind_text(stmt, 3, body, -1, SQLITE_STATIC);
+    }
+    if (ret != SQLITE_OK) {
+        sqlite3_finalize(stmt);
+        return ret;
+    }
+    ret = sqlite3_step(stmt);
+    sqlite3_finalize(stmt);
+    if (ret != SQLITE_DONE) {
+        return ret;
+    }
+    return SQLITE_OK;
+}
+
+// no cache at all
+#if DEFINE_CACHE == 0
+
+/*
+ * Executes user-defined sql from the context.
+ *
+ * The SQL text is the function's user data. The call arguments are bound
+ * to its parameters in order, and the first column of the first row
+ * becomes the result. A statement that yields no row is reported as
+ * SQLITE_MISUSE.
+ */
+static void define_exec(sqlite3_context* ctx, int argc, sqlite3_value** argv) {
+    int ret = SQLITE_OK;
+    char* sql = sqlite3_user_data(ctx);
+    sqlite3_stmt* stmt;
+    // sqlite3_close requires all prepared statements to be closed before destroying functions, so
+    // we have to re-create this every call
+    if ((ret = sqlite3_prepare_v2(sqlite3_context_db_handle(ctx), sql, -1, &stmt, NULL)) !=
+        SQLITE_OK) {
+        sqlite3_result_error_code(ctx, ret);
+        return;
+    }
+    for (int i = 0; i < argc; i++)
+        if ((ret = sqlite3_bind_value(stmt, i + 1, argv[i])) != SQLITE_OK)
+            goto end;
+    if ((ret = sqlite3_step(stmt)) != SQLITE_ROW) {
+        if (ret == SQLITE_DONE)
+            ret = SQLITE_MISUSE;
+        goto end;
+    }
+    sqlite3_result_value(ctx, sqlite3_column_value(stmt, 0));
+
+end:
+    sqlite3_finalize(stmt);
+    if (ret != SQLITE_ROW)
+        sqlite3_result_error_code(ctx, ret);
+}
+
+/*
+ * Creates user-defined function without caching the prepared statement.
+ *
+ * The body is wrapped into "select <body>" and prepared once to validate
+ * it and to count its parameters, which become the function's arity. The
+ * SQL text is attached to the function as user data, with sqlite3_free
+ * registered as its destructor.
+ */
+static int define_create(sqlite3* db, const char* name, const char* body) {
+    char* sql = sqlite3_mprintf("select %s", body);
+    if (!sql) {
+        return SQLITE_NOMEM;
+    }
+
+    sqlite3_stmt* stmt;
+    int ret = sqlite3_prepare_v3(db, sql, -1, SQLITE_PREPARE_PERSISTENT, &stmt, NULL);
+    if (ret != SQLITE_OK) {
+        sqlite3_free(sql);
+        return ret;
+    }
+    int nparams = sqlite3_bind_parameter_count(stmt);
+    sqlite3_finalize(stmt);
+
+    return sqlite3_create_function_v2(db, name, nparams, SQLITE_UTF8, sql, define_exec, NULL, NULL,
+                                      sqlite3_free);
+}
+
+/*
+ * Creates user-defined function and saves it to the database.
+ * argv[0] is the function name, argv[1] its SQL body.
+ */
+static void define_function(sqlite3_context* ctx, int argc, sqlite3_value** argv) {
+    sqlite3* db = sqlite3_context_db_handle(ctx);
+    const char* name = (const char*)sqlite3_value_text(argv[0]);
+    const char* body = (const char*)sqlite3_value_text(argv[1]);
+    int ret;
+    if ((ret = define_create(db, name, body)) != SQLITE_OK) {
+        sqlite3_result_error_code(ctx, ret);
+        return;
+    }
+    if ((ret = define_save_function(db, name, "scalar", body)) != SQLITE_OK) {
+        sqlite3_result_error_code(ctx, ret);
+        return;
+    }
+}
+
+/*
+ * No-op as nothing is cached.
+ */
+static void define_free(sqlite3_context* ctx, int argc, sqlite3_value** argv) {}
+
+// custom cache
+#elif DEFINE_CACHE == 2
+
+/*
+ * Executes compiled prepared statement from the context.
+ */ +static void define_exec(sqlite3_context* ctx, int argc, sqlite3_value** argv) { + int ret = SQLITE_OK; + sqlite3_stmt* stmt = sqlite3_user_data(ctx); + for (int i = 0; i < argc; i++) { + if ((ret = sqlite3_bind_value(stmt, i + 1, argv[i])) != SQLITE_OK) { + sqlite3_reset(stmt); + sqlite3_result_error_code(ctx, ret); + return; + } + } + if ((ret = sqlite3_step(stmt)) != SQLITE_ROW) { + if (ret == SQLITE_DONE) { + ret = SQLITE_MISUSE; + } + sqlite3_reset(stmt); + sqlite3_result_error_code(ctx, ret); + return; + } + sqlite3_result_value(ctx, sqlite3_column_value(stmt, 0)); + sqlite3_reset(stmt); +} + +/* + * Creates user-defined function and caches the prepared statement. + */ +static int define_create(sqlite3* db, const char* name, const char* body) { + char* sql = sqlite3_mprintf("select %s", body); + if (!sql) { + return SQLITE_NOMEM; + } + + sqlite3_stmt* stmt; + int ret = sqlite3_prepare_v3(db, sql, -1, SQLITE_PREPARE_PERSISTENT, &stmt, NULL); + sqlite3_free(sql); + if (ret != SQLITE_OK) { + return ret; + } + int nparams = sqlite3_bind_parameter_count(stmt); + // We are going to cache the statement in the function constructor and retrieve it later + // when executing the function, using sqlite3_user_data(). But relying on this internal cache + // is not enough. + // + // SQLite requires all prepared statements to be closed before calling the function destructor + // when closing the connection. So we can't close the statement in the function destructor. + // We have to cache it in the external cache and ask the user to manually free it + // before closing the connection. + // + // Alternatively, we can cache via the sqlite3_set_auxdata() with a negative slot, + // but that seems rather hacky. + if ((ret = cache_add(stmt)) != SQLITE_OK) { + return ret; + } + + return sqlite3_create_function(db, name, nparams, SQLITE_UTF8, stmt, define_exec, NULL, NULL); +} + +/* + * Creates compiled user-defined function and saves it to the database. 
+ */ +static void define_function(sqlite3_context* ctx, int argc, sqlite3_value** argv) { + sqlite3* db = sqlite3_context_db_handle(ctx); + const char* name = (const char*)sqlite3_value_text(argv[0]); + const char* body = (const char*)sqlite3_value_text(argv[1]); + int ret; + if ((ret = define_create(db, name, body)) != SQLITE_OK) { + sqlite3_result_error_code(ctx, ret); + return; + } + if ((ret = define_save_function(db, name, "scalar", body)) != SQLITE_OK) { + sqlite3_result_error_code(ctx, ret); + return; + } +} + +/* + * Frees prepared statements compiled by user-defined functions. + */ +static void define_free(sqlite3_context* ctx, int argc, sqlite3_value** argv) { + cache_free(); +} + +#endif // DEFINE_CACHE + +/* + * Deletes user-defined function (scalar or table-valued) + */ +static void define_undefine(sqlite3_context* ctx, int argc, sqlite3_value** argv) { + char* template = + "delete from sqlean_define where name = '%q';" + "drop table if exists \"%w\";"; + const char* name = (const char*)sqlite3_value_text(argv[0]); + char* sql = sqlite3_mprintf(template, name, name); + if (!sql) { + sqlite3_result_error_code(ctx, SQLITE_NOMEM); + return; + } + + sqlite3* db = sqlite3_context_db_handle(ctx); + int ret = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (ret != SQLITE_OK) { + sqlite3_result_error_code(ctx, ret); + } + sqlite3_free(sql); +} + +/* + * Loads user-defined functions from the database. 
+ */ +static int define_load(sqlite3* db) { + char* sql = + "create table if not exists sqlean_define" + "(name text primary key, type text, body text)"; + int ret = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (ret != SQLITE_OK) { + return ret; + } + + sqlite3_stmt* stmt; + sql = "select name, body from sqlean_define where type = 'scalar'"; + if ((ret = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL)) != SQLITE_OK) { + return ret; + } + + const char* name; + const char* body; + while (sqlite3_step(stmt) != SQLITE_DONE) { + name = (const char*)sqlite3_column_text(stmt, 0); + body = (const char*)sqlite3_column_text(stmt, 1); + ret = define_create(db, name, body); + if (ret != SQLITE_OK) { + break; + } + } + return sqlite3_finalize(stmt); +} + +int define_manage_init(sqlite3* db) { + const int flags = SQLITE_UTF8 | SQLITE_DIRECTONLY; + sqlite3_create_function(db, "define", 2, flags, NULL, define_function, NULL, NULL); + sqlite3_create_function(db, "define_free", 0, flags, NULL, define_free, NULL, NULL); + sqlite3_create_function(db, "define_cache", 0, flags, NULL, define_cache, NULL, NULL); + sqlite3_create_function(db, "undefine", 1, flags, NULL, define_undefine, NULL, NULL); + return define_load(db); +} diff --git a/libsql-ffi/bundled/sqlean/define/module.c b/libsql-ffi/bundled/sqlean/define/module.c new file mode 100644 index 0000000000..17b3567ad9 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/define/module.c @@ -0,0 +1,346 @@ +// Created by 0x09, Public Domain +// https://github.com/0x09/sqlite-statement-vtab/blob/master/statement_vtab.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// Define table-valued functions. 
+ +#include +#include +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "define/define.h" + +struct define_vtab { + sqlite3_vtab base; + sqlite3* db; + char* sql; + size_t sql_len; + int num_inputs; + int num_outputs; +}; + +struct define_cursor { + sqlite3_vtab_cursor base; + sqlite3_stmt* stmt; + int rowid; + int param_argc; + sqlite3_value** param_argv; +}; + +static char* build_create_statement(sqlite3_stmt* stmt) { + sqlite3_str* sql = sqlite3_str_new(NULL); + sqlite3_str_appendall(sql, "CREATE TABLE x( "); + for (int i = 0, nout = sqlite3_column_count(stmt); i < nout; i++) { + const char* name = sqlite3_column_name(stmt, i); + if (!name) { + sqlite3_free(sqlite3_str_finish(sql)); + return NULL; + } + const char* type = sqlite3_column_decltype(stmt, i); + sqlite3_str_appendf(sql, "%Q %s,", name, (type ? type : "")); + } + for (int i = 0, nargs = sqlite3_bind_parameter_count(stmt); i < nargs; i++) { + const char* name = sqlite3_bind_parameter_name(stmt, i + 1); + if (name) + sqlite3_str_appendf(sql, "%Q hidden,", name + 1); + else + sqlite3_str_appendf(sql, "'%d' hidden,", i + 1); + } + if (sqlite3_str_length(sql)) + sqlite3_str_value(sql)[sqlite3_str_length(sql) - 1] = ')'; + return sqlite3_str_finish(sql); +} + +static int define_vtab_destroy(sqlite3_vtab* pVTab) { + sqlite3_free(((struct define_vtab*)pVTab)->sql); + sqlite3_free(pVTab); + return SQLITE_OK; +} + +static int define_vtab_create(sqlite3* db, + void* pAux, + int argc, + const char* const* argv, + sqlite3_vtab** ppVtab, + char** pzErr) { + size_t len; + if (argc < 4 || (len = strlen(argv[3])) < 3) { + if (!(*pzErr = sqlite3_mprintf("no statement provided"))) + return SQLITE_NOMEM; + return SQLITE_MISUSE; + } + if (argv[3][0] != '(' || argv[3][len - 1] != ')') { + if (!(*pzErr = sqlite3_mprintf("statement must be parenthesized"))) + return SQLITE_NOMEM; + return SQLITE_MISUSE; + } + + int ret; + sqlite3_stmt* stmt = NULL; + char* create = NULL; + 
+ struct define_vtab* vtab = sqlite3_malloc64(sizeof(*vtab)); + if (!vtab) { + return SQLITE_NOMEM; + } + memset(vtab, 0, sizeof(*vtab)); + *ppVtab = &vtab->base; + + vtab->db = db; + vtab->sql_len = len - 2; + if (!(vtab->sql = sqlite3_mprintf("%.*s", vtab->sql_len, argv[3] + 1))) { + ret = SQLITE_NOMEM; + goto error; + } + + ret = sqlite3_prepare_v2(db, vtab->sql, vtab->sql_len, &stmt, NULL); + if (ret != SQLITE_OK) { + goto sqlite_error; + } + + if (!sqlite3_stmt_readonly(stmt)) { + ret = SQLITE_ERROR; + if (!(*pzErr = sqlite3_mprintf("Statement must be read only."))) + ret = SQLITE_NOMEM; + goto error; + } + + vtab->num_inputs = sqlite3_bind_parameter_count(stmt); + vtab->num_outputs = sqlite3_column_count(stmt); + + if (!(create = build_create_statement(stmt))) { + ret = SQLITE_NOMEM; + goto error; + } + + if ((ret = sqlite3_declare_vtab(db, create)) != SQLITE_OK) { + goto sqlite_error; + } + + if ((ret = define_save_function(db, argv[2], "table", argv[3])) != SQLITE_OK) { + goto error; + } + + sqlite3_free(create); + sqlite3_finalize(stmt); + return SQLITE_OK; + +sqlite_error: + if (!(*pzErr = sqlite3_mprintf("%s", sqlite3_errmsg(db)))) + ret = SQLITE_NOMEM; +error: + sqlite3_free(create); + sqlite3_finalize(stmt); + define_vtab_destroy(*ppVtab); + *ppVtab = NULL; + return ret; +} + +// if these point to the literal same function sqlite makes define_vtab eponymous, which we don't +// want +static int define_vtab_connect(sqlite3* db, + void* pAux, + int argc, + const char* const* argv, + sqlite3_vtab** ppVtab, + char** pzErr) { + return define_vtab_create(db, pAux, argc, argv, ppVtab, pzErr); +} + +static int define_vtab_open(sqlite3_vtab* pVTab, sqlite3_vtab_cursor** ppCursor) { + struct define_vtab* vtab = (struct define_vtab*)pVTab; + struct define_cursor* cur = sqlite3_malloc64(sizeof(*cur)); + if (!cur) + return SQLITE_NOMEM; + + *ppCursor = &cur->base; + cur->param_argv = sqlite3_malloc(sizeof(*cur->param_argv) * vtab->num_inputs); + return 
sqlite3_prepare_v2(vtab->db, vtab->sql, vtab->sql_len, &cur->stmt, NULL); +} + +static int define_vtab_close(sqlite3_vtab_cursor* cur) { + struct define_cursor* stmtcur = (struct define_cursor*)cur; + sqlite3_finalize(stmtcur->stmt); + sqlite3_free(stmtcur->param_argv); + sqlite3_free(cur); + return SQLITE_OK; +} + +static int define_vtab_next(sqlite3_vtab_cursor* cur) { + struct define_cursor* stmtcur = (struct define_cursor*)cur; + int ret = sqlite3_step(stmtcur->stmt); + if (ret == SQLITE_ROW) { + stmtcur->rowid++; + return SQLITE_OK; + } + return ret == SQLITE_DONE ? SQLITE_OK : ret; +} + +static int define_vtab_rowid(sqlite3_vtab_cursor* cur, sqlite_int64* pRowid) { + *pRowid = ((struct define_cursor*)cur)->rowid; + return SQLITE_OK; +} + +static int define_vtab_eof(sqlite3_vtab_cursor* cur) { + return !sqlite3_stmt_busy(((struct define_cursor*)cur)->stmt); +} + +static int define_vtab_column(sqlite3_vtab_cursor* cur, sqlite3_context* ctx, int i) { + struct define_cursor* stmtcur = (struct define_cursor*)cur; + int num_outputs = ((struct define_vtab*)cur->pVtab)->num_outputs; + if (i < num_outputs) + sqlite3_result_value(ctx, sqlite3_column_value(stmtcur->stmt, i)); + else if (i - num_outputs < stmtcur->param_argc) + sqlite3_result_value(ctx, stmtcur->param_argv[i - num_outputs]); + return SQLITE_OK; +} + +// parameter map encoding for xBestIndex/xFilter +// constraint -> param index mappings are stored in idxStr when not contiguous. idxStr is expected +// to be NUL terminated and printable, so we use a 6 bit encoding in the ASCII range. for simplicity +// encoded indexes are fixed to the length necessary to encode an int. 
this is overkill on most +// systems due to sqlite's current hard limit on number of columns but makes define_vtab agnostic +// to changes to this limit +const static size_t param_idx_size = (sizeof(int) * CHAR_BIT + 5) / 6; + +static inline void encode_param_idx(int i, char* restrict param_map, int param_idx) { + assert(param_idx >= 0); + for (size_t j = 0; j < param_idx_size; j++) + param_map[i * param_idx_size + j] = ((param_idx >> 6 * j) & 63) + 33; +} + +static inline int decode_param_idx(int i, const char* param_map) { + int param_idx = 0; + for (size_t j = 0; j < param_idx_size; j++) + param_idx |= (param_map[i * param_idx_size + j] - 33) << 6 * j; + return param_idx; +} + +// xBestIndex needs to communicate which columns are constrained by the where clause to xFilter; +// in terms of a statement table this translates to which parameters will be available to bind. +static int define_vtab_filter(sqlite3_vtab_cursor* cur, + int idxNum, + const char* idxStr, + int argc, + sqlite3_value** argv) { + struct define_cursor* stmtcur = (struct define_cursor*)cur; + stmtcur->rowid = 1; + sqlite3_stmt* stmt = stmtcur->stmt; + sqlite3_reset(stmt); + sqlite3_clear_bindings(stmt); + + int ret; + for (int i = 0; i < argc; i++) { + int param_idx = idxStr ? 
decode_param_idx(i, idxStr) : i + 1; + if ((ret = sqlite3_bind_value(stmt, param_idx, argv[i])) != SQLITE_OK) + return ret; + } + ret = sqlite3_step(stmt); + if (!(ret == SQLITE_ROW || ret == SQLITE_DONE)) + return ret; + + assert(((struct define_vtab*)cur->pVtab)->num_inputs >= argc); + if ((stmtcur->param_argc = argc)) // shallow copy args as these are explicitly retained in + // sqlite3WhereCodeOneLoopStart + memcpy(stmtcur->param_argv, argv, sizeof(*stmtcur->param_argv) * argc); + + return SQLITE_OK; +} + +static int define_vtab_best_index(sqlite3_vtab* pVTab, sqlite3_index_info* index_info) { + int num_outputs = ((struct define_vtab*)pVTab)->num_outputs; + int out_constraints = 0; + index_info->orderByConsumed = 0; + index_info->estimatedCost = 1; + index_info->estimatedRows = 1; + int col_max = 0; + sqlite3_uint64 used_cols = 0; + for (int i = 0; i < index_info->nConstraint; i++) { + // skip if this is a constraint on one of our output columns + if (index_info->aConstraint[i].iColumn < num_outputs) + continue; + // a given query plan is only usable if all provided "input" columns are usable and have + // equal constraints only is this redundant / an EQ constraint ever unusable? 
+ if (!index_info->aConstraint[i].usable || + index_info->aConstraint[i].op != SQLITE_INDEX_CONSTRAINT_EQ) + return SQLITE_CONSTRAINT; + + int col_index = index_info->aConstraint[i].iColumn - num_outputs; + index_info->aConstraintUsage[i].argvIndex = col_index + 1; + index_info->aConstraintUsage[i].omit = 1; + + if (col_index + 1 > col_max) + col_max = col_index + 1; + if (col_index < 64) + used_cols |= 1ull << col_index; + + out_constraints++; + } + + // if the constrained columns are contiguous then we can just tell sqlite to order the arg + // vector provided to xFilter in the same order as our column bindings, so there's no need to + // map between these (this will always be the case when calling the vtab as a table-valued + // function) only support this optimization for up to 64 constrained columns since checking for + // continuity more generally would cost nearly as much as just allocating the mapping + sqlite_uint64 required_cols = (col_max < 64 ? 1ull << col_max : 0ull) - 1; + if (!out_constraints || + (col_max <= 64 && used_cols == required_cols && out_constraints == col_max)) + return SQLITE_OK; + + // otherwise map the constraint index as provided to xFilter to column index for bindings + // this will only be necessary when constraints are not contiguous e.g. 
where arg1 = x and arg3 + // = y in that case bound parameter indexes are encoded as a string in idxStr, in the order they + // appear in constriants + if ((size_t)out_constraints > (SIZE_MAX - 1) / param_idx_size) { + sqlite3_free(pVTab->zErrMsg); + if (!(pVTab->zErrMsg = + sqlite3_mprintf("Too many constraints to index: %d", out_constraints))) + return SQLITE_NOMEM; + return SQLITE_ERROR; + } + + if (!(index_info->idxStr = sqlite3_malloc64(out_constraints * param_idx_size + 1))) + return SQLITE_NOMEM; + + index_info->needToFreeIdxStr = 1; + + for (int i = 0, constraint_idx = 0; i < index_info->nConstraint; i++) { + if (!index_info->aConstraintUsage[i].argvIndex) + continue; + encode_param_idx(constraint_idx, index_info->idxStr, + index_info->aConstraintUsage[i].argvIndex); + index_info->aConstraintUsage[i].argvIndex = ++constraint_idx; + } + + index_info->idxStr[out_constraints * param_idx_size] = '\0'; + + return SQLITE_OK; +} + +static sqlite3_module define_module = { + .xCreate = define_vtab_create, + .xConnect = define_vtab_connect, + .xBestIndex = define_vtab_best_index, + .xDisconnect = define_vtab_destroy, + .xDestroy = define_vtab_destroy, + .xOpen = define_vtab_open, + .xClose = define_vtab_close, + .xFilter = define_vtab_filter, + .xNext = define_vtab_next, + .xEof = define_vtab_eof, + .xColumn = define_vtab_column, + .xRowid = define_vtab_rowid, +}; + +int define_module_init(sqlite3* db) { + sqlite3_create_module(db, "define", &define_module, NULL); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/fileio/extension.c b/libsql-ffi/bundled/sqlean/fileio/extension.c new file mode 100644 index 0000000000..8fac2e4e4e --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fileio/extension.c @@ -0,0 +1,16 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Read and write files in SQLite. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "fileio/fileio.h" + +int fileio_init(sqlite3* db) { + fileio_scalar_init(db); + fileio_ls_init(db); + fileio_scan_init(db); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/fileio/extension.h b/libsql-ffi/bundled/sqlean/fileio/extension.h new file mode 100644 index 0000000000..05c8215c2c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fileio/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Read and write files in SQLite. + +#ifndef FILEIO_EXTENSION_H +#define FILEIO_EXTENSION_H + +#include "sqlite3ext.h" + +int fileio_init(sqlite3* db); + +#endif /* FILEIO_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/fileio/fileio.h b/libsql-ffi/bundled/sqlean/fileio/fileio.h new file mode 100644 index 0000000000..7f4c109234 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fileio/fileio.h @@ -0,0 +1,15 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Read and write files in SQLite. + +#ifndef FILEIO_INTERNAL_H +#define FILEIO_INTERNAL_H + +#include "sqlite3ext.h" + +int fileio_ls_init(sqlite3* db); +int fileio_scalar_init(sqlite3* db); +int fileio_scan_init(sqlite3* db); + +#endif /* FILEIO_INTERNAL_H */ diff --git a/libsql-ffi/bundled/sqlean/fileio/legacy.c b/libsql-ffi/bundled/sqlean/fileio/legacy.c new file mode 100644 index 0000000000..291bc39fa3 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fileio/legacy.c @@ -0,0 +1,1183 @@ +// Originally by D. Richard Hipp, Public Domain +// https://www.sqlite.org/src/file/ext/misc/fileio.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +/* + * This SQLite extension implements SQL functions + * for reading, writing and listing files and folders. 
+ * + * + * Notes on building the extension for Windows: + * Unless linked statically with the SQLite library, a preprocessor + * symbol, FILEIO_WIN32_DLL, must be #define'd to create a stand-alone + * DLL form of this extension for WIN32. See its use below for details. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#if !defined(_WIN32) && !defined(WIN32) +#include +#include +#include +#include + +#else + +#if !defined(_MSC_VER) +#define _MSC_VER 1929 +#endif +#define FILEIO_WIN32_DLL +#include +#include +#include "test_windirent.h" +#include "windows.h" +#define dirent DIRENT + +#ifndef chmod +#define chmod _chmod +#endif + +#ifndef stat +#define stat _stat +#endif +#define mkdir(path, mode) _mkdir(path) +#define lstat(path, buf) stat(path, buf) + +#endif + +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +/* +** Structure of the fsdir() table-valued function +*/ +/* 0 1 2 3 4 5 */ +#define FSDIR_SCHEMA "(name,mode,mtime,size,path HIDDEN,dir HIDDEN)" +#define FSDIR_COLUMN_NAME 0 /* Name of the file */ +#define FSDIR_COLUMN_MODE 1 /* Access mode */ +#define FSDIR_COLUMN_MTIME 2 /* Last modification time */ +#define FSDIR_COLUMN_SIZE 3 /* File size */ +#define FSDIR_COLUMN_PATH 4 /* Path to top of search */ +#define FSDIR_COLUMN_REC 5 /* Recursive flag */ + +/* +** Set the result stored by context ctx to a blob containing the +** contents of file zName. Or, leave the result unchanged (NULL) +** if the file does not exist or is unreadable. +** +** If the file exceeds the SQLite blob size limit, through an +** SQLITE_TOOBIG error. +** +** Throw an SQLITE_IOERR if there are difficulties pulling the file +** off of disk. 
+*/ +static void readFileContents(sqlite3_context* ctx, + const char* zName, + const int nOffset, + const int nLimit) { + FILE* in; + sqlite3_int64 nIn; + void* pBuf; + sqlite3* db; + int mxBlob; + + assert(nOffset >= 0); + assert(nLimit >= 0); + + in = fopen(zName, "rb"); + if (in == 0) { + /* File does not exist or is unreadable. Leave the result set to NULL. */ + return; + } + fseek(in, 0, SEEK_END); + nIn = ftell(in); + rewind(in); + + if (nOffset > nIn) { + /* offset is greater than the size of the file */ + sqlite3_result_zeroblob(ctx, 0); + fclose(in); + return; + } + if (nOffset > 0) { + fseek(in, nOffset, SEEK_SET); + nIn -= nOffset; + } + + if (nLimit > 0 && nLimit < nIn) { + nIn = nLimit; + } + + db = sqlite3_context_db_handle(ctx); + mxBlob = sqlite3_limit(db, SQLITE_LIMIT_LENGTH, -1); + if (nIn > mxBlob) { + sqlite3_result_error_code(ctx, SQLITE_TOOBIG); + fclose(in); + return; + } + pBuf = sqlite3_malloc64(nIn ? nIn : 1); + if (pBuf == 0) { + sqlite3_result_error_nomem(ctx); + fclose(in); + return; + } + if (nIn == (sqlite3_int64)fread(pBuf, 1, (size_t)nIn, in)) { + sqlite3_result_blob64(ctx, pBuf, nIn, sqlite3_free); + } else { + sqlite3_result_error_code(ctx, SQLITE_IOERR); + sqlite3_free(pBuf); + } + fclose(in); +} + +/* +** Implementation of the "readfile(X)" SQL function. The entire content +** of the file named X is read and returned as a BLOB. NULL is returned +** if the file does not exist or is unreadable. 
+*/ +static void fileio_readfile(sqlite3_context* context, int argc, sqlite3_value** argv) { + const char* zName = (const char*)sqlite3_value_text(argv[0]); + if (zName == 0) { + return; + } + + int nOffset = 0; + if (argc >= 2 && sqlite3_value_type(argv[1]) != SQLITE_NULL) { + nOffset = sqlite3_value_int(argv[1]); + if (nOffset < 0) { + sqlite3_result_error(context, "offset must be >= 0", -1); + return; + } + } + + int nLimit = 0; + if (argc == 3 && sqlite3_value_type(argv[2]) != SQLITE_NULL) { + nLimit = sqlite3_value_int(argv[2]); + if (nLimit < 0) { + sqlite3_result_error(context, "limit must be >= 0", -1); + return; + } + } + + readFileContents(context, zName, nOffset, nLimit); +} + +/* +** Set the error message contained in context ctx to the results of +** vprintf(zFmt, ...). +*/ +static void ctxErrorMsg(sqlite3_context* ctx, const char* zFmt, ...) { + char* zMsg = 0; + va_list ap; + va_start(ap, zFmt); + zMsg = sqlite3_vmprintf(zFmt, ap); + sqlite3_result_error(ctx, zMsg, -1); + sqlite3_free(zMsg); + va_end(ap); +} + +#if defined(_WIN32) +/* +** This function is designed to convert a Win32 FILETIME structure into the +** number of seconds since the Unix Epoch (1970-01-01 00:00:00 UTC). 
+*/ +static sqlite3_uint64 fileTimeToUnixTime(LPFILETIME pFileTime) { + SYSTEMTIME epochSystemTime; + ULARGE_INTEGER epochIntervals; + FILETIME epochFileTime; + ULARGE_INTEGER fileIntervals; + + memset(&epochSystemTime, 0, sizeof(SYSTEMTIME)); + epochSystemTime.wYear = 1970; + epochSystemTime.wMonth = 1; + epochSystemTime.wDay = 1; + SystemTimeToFileTime(&epochSystemTime, &epochFileTime); + epochIntervals.LowPart = epochFileTime.dwLowDateTime; + epochIntervals.HighPart = epochFileTime.dwHighDateTime; + + fileIntervals.LowPart = pFileTime->dwLowDateTime; + fileIntervals.HighPart = pFileTime->dwHighDateTime; + + return (fileIntervals.QuadPart - epochIntervals.QuadPart) / 10000000; +} + +#if defined(FILEIO_WIN32_DLL) && (defined(_WIN32) || defined(WIN32)) +#/* To allow a standalone DLL, use this next replacement function: */ +#undef sqlite3_win32_utf8_to_unicode +#define sqlite3_win32_utf8_to_unicode utf8_to_utf16 +# +LPWSTR utf8_to_utf16(const char* z) { + int nAllot = MultiByteToWideChar(CP_UTF8, 0, z, -1, NULL, 0); + LPWSTR rv = sqlite3_malloc(nAllot * sizeof(WCHAR)); + if (rv != 0 && 0 < MultiByteToWideChar(CP_UTF8, 0, z, -1, rv, nAllot)) + return rv; + sqlite3_free(rv); + return 0; +} +#endif + +/* +** This function attempts to normalize the time values found in the stat() +** buffer to UTC. This is necessary on Win32, where the runtime library +** appears to return these values as local times. 
+*/ +static void statTimesToUtc(const char* zPath, struct stat* pStatBuf) { + HANDLE hFindFile; + WIN32_FIND_DATAW fd; + LPWSTR zUnicodeName; + extern LPWSTR sqlite3_win32_utf8_to_unicode(const char*); + zUnicodeName = sqlite3_win32_utf8_to_unicode(zPath); + if (zUnicodeName) { + memset(&fd, 0, sizeof(WIN32_FIND_DATAW)); + hFindFile = FindFirstFileW(zUnicodeName, &fd); + if (hFindFile != NULL) { + pStatBuf->st_ctime = (time_t)fileTimeToUnixTime(&fd.ftCreationTime); + pStatBuf->st_atime = (time_t)fileTimeToUnixTime(&fd.ftLastAccessTime); + pStatBuf->st_mtime = (time_t)fileTimeToUnixTime(&fd.ftLastWriteTime); + FindClose(hFindFile); + } + sqlite3_free(zUnicodeName); + } +} +#endif + +/* +** This function is used in place of stat(). On Windows, special handling +** is required in order for the included time to be returned as UTC. On all +** other systems, this function simply calls stat(). +*/ +static int fileStat(const char* zPath, struct stat* pStatBuf) { +#if defined(_WIN32) + int rc = stat(zPath, pStatBuf); + if (rc == 0) + statTimesToUtc(zPath, pStatBuf); + return rc; +#else + return stat(zPath, pStatBuf); +#endif +} + +/* +** This function is used in place of lstat(). On Windows, special handling +** is required in order for the included time to be returned as UTC. On all +** other systems, this function simply calls lstat(). +*/ +static int fileLinkStat(const char* zPath, struct stat* pStatBuf) { +#if defined(_WIN32) + int rc = lstat(zPath, pStatBuf); + if (rc == 0) + statTimesToUtc(zPath, pStatBuf); + return rc; +#else + return lstat(zPath, pStatBuf); +#endif +} + +/* +** Argument zFile is the name of a file that will be created and/or written +** by SQL function writefile(). This function ensures that the directory +** zFile will be written to exists, creating it if required. The permissions +** for any path components created by this function are set in accordance +** with the current umask. 
+** +** If an OOM condition is encountered, SQLITE_NOMEM is returned. Otherwise, +** SQLITE_OK is returned if the directory is successfully created, or +** SQLITE_ERROR otherwise. +*/ +static int makeParentDirectory(const char* zFile) { + char* zCopy = sqlite3_mprintf("%s", zFile); + int rc = SQLITE_OK; + + if (zCopy == 0) { + rc = SQLITE_NOMEM; + } else { + int nCopy = (int)strlen(zCopy); + int i = 1; + + while (rc == SQLITE_OK) { + struct stat sStat; + int rc2; + + for (; zCopy[i] != '/' && i < nCopy; i++) + ; + if (i == nCopy) + break; + zCopy[i] = '\0'; + + rc2 = fileStat(zCopy, &sStat); + if (rc2 != 0) { + if (mkdir(zCopy, 0777)) + rc = SQLITE_ERROR; + } else { + if (!S_ISDIR(sStat.st_mode)) + rc = SQLITE_ERROR; + } + zCopy[i] = '/'; + i++; + } + + sqlite3_free(zCopy); + } + + return rc; +} + +/* + * Creates a directory named `path` with permission bits `mode`. + */ +static int makeDirectory(sqlite3_context* ctx, const char* path, mode_t mode) { + int res = mkdir(path, mode); + if (res != 0) { + /* The mkdir() call to create the directory failed. This might not + ** be an error though - if there is already a directory at the same + ** path and either the permissions already match or can be changed + ** to do so using chmod(), it is not an error. */ + struct stat sStat; + if (errno != EEXIST || 0 != fileStat(path, &sStat) || !S_ISDIR(sStat.st_mode) || + ((sStat.st_mode & 0777) != (mode & 0777) && 0 != chmod(path, mode & 0777))) { + return 1; + } + } + return 0; +} + +/* + * Creates a symbolic link named `dst`, pointing to `src`. + */ +static int createSymlink(sqlite3_context* ctx, const char* src, const char* dst) { +#ifdef _WIN32 + return 0; +#else + int res = symlink(src, dst) < 0; + if (res < 0) { + return 1; + } + return 0; +#endif +} + +/* + * Writes blob `pData` to a file specified by `zFile`, + * with permission bits `mode` and modification time `mtime` (-1 to not set). + * Returns the number of written bytes. 
+ */ +static int writeFile(sqlite3_context* pCtx, + const char* zFile, + sqlite3_value* pData, + mode_t mode, + sqlite3_int64 mtime) { + sqlite3_int64 nWrite = 0; + const char* z; + int rc = 0; + FILE* out = fopen(zFile, "wb"); + if (out == 0) + return 1; + z = (const char*)sqlite3_value_blob(pData); + if (z) { + sqlite3_int64 n = fwrite(z, 1, sqlite3_value_bytes(pData), out); + nWrite = sqlite3_value_bytes(pData); + if (nWrite != n) { + rc = 1; + } + } + fclose(out); + if (rc == 0 && mode && chmod(zFile, mode)) { + rc = 1; + } + if (rc) + return 2; + sqlite3_result_int64(pCtx, nWrite); + + if (mtime >= 0) { +#if defined(_WIN32) +#if !SQLITE_OS_WINRT + /* Windows */ + FILETIME lastAccess; + FILETIME lastWrite; + SYSTEMTIME currentTime; + LONGLONG intervals; + HANDLE hFile; + LPWSTR zUnicodeName; + extern LPWSTR sqlite3_win32_utf8_to_unicode(const char*); + + GetSystemTime(¤tTime); + SystemTimeToFileTime(¤tTime, &lastAccess); + intervals = Int32x32To64(mtime, 10000000) + 116444736000000000; + lastWrite.dwLowDateTime = (DWORD)intervals; + lastWrite.dwHighDateTime = intervals >> 32; + zUnicodeName = sqlite3_win32_utf8_to_unicode(zFile); + if (zUnicodeName == 0) { + return 1; + } + hFile = CreateFileW(zUnicodeName, FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, NULL); + sqlite3_free(zUnicodeName); + if (hFile != INVALID_HANDLE_VALUE) { + BOOL bResult = SetFileTime(hFile, NULL, &lastAccess, &lastWrite); + CloseHandle(hFile); + return !bResult; + } else { + return 1; + } +#endif +#elif defined(AT_FDCWD) && 0 /* utimensat() is not universally available */ + /* Recent unix */ + struct timespec times[2]; + times[0].tv_nsec = times[1].tv_nsec = 0; + times[0].tv_sec = time(0); + times[1].tv_sec = mtime; + if (utimensat(AT_FDCWD, zFile, times, AT_SYMLINK_NOFOLLOW)) { + return 1; + } +#else + /* Legacy unix */ + struct timeval times[2]; + times[0].tv_usec = times[1].tv_usec = 0; + times[0].tv_sec = time(0); + times[1].tv_sec = mtime; + if 
(utimes(zFile, times)) { + return 1; + } +#endif + } + + return 0; +} + +// Writes data to a file. +// writefile(path, data[, perm[, mtime]]) +static void fileio_writefile(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_int64 mtime = -1; + + if (argc < 2 || argc > 4) { + sqlite3_result_error(context, "wrong number of arguments to function writefile()", -1); + return; + } + + const char* zFile = (const char*)sqlite3_value_text(argv[0]); + if (zFile == 0) { + return; + } + + mode_t perm = 0666; + if (argc >= 3) { + perm = (mode_t)sqlite3_value_int(argv[2]); + } + + if (argc == 4) { + mtime = sqlite3_value_int64(argv[3]); + } + + int res = writeFile(context, zFile, argv[1], perm, mtime); + if (res == 1 && errno == ENOENT) { + if (makeParentDirectory(zFile) == SQLITE_OK) { + res = writeFile(context, zFile, argv[1], perm, mtime); + } + } + + if (argc > 2 && res != 0) { + ctxErrorMsg(context, "failed to write file: %s", zFile); + } +} + +// Appends string to a file specified by path. 
+// fileio_append(path, str) +static void fileio_append(sqlite3_context* ctx, int argc, sqlite3_value** argv) { + bool is_new_file = false; + FILE* file = sqlite3_get_auxdata(ctx, 0); + if (file == NULL) { + const char* path = (const char*)sqlite3_value_text(argv[0]); + file = fopen(path, "a"); + if (file == NULL && errno == ENOENT) { + // parent directory does not exist, let's create it + if (makeParentDirectory(path) == SQLITE_OK) { + file = fopen(path, "a"); + } + } + if (file == NULL) { + sqlite3_result_error(ctx, "failed to open file", -1); + return; + } + is_new_file = true; + } + + const char* str = (const char*)sqlite3_value_text(argv[1]); + int rc = fputs(str, file); + if (rc < 0) { + if (is_new_file) { + fclose(file); + } + sqlite3_result_error(ctx, "failed to append string to file", -1); + return; + } + + size_t n = strlen(str); + sqlite3_result_int(ctx, n); + + if (is_new_file) { + sqlite3_set_auxdata(ctx, 0, file, (void (*)(void*))fclose); + } +} + +// Creates a symlink. +// symlink(src, dst) +static void fileio_symlink(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc != 2) { + sqlite3_result_error(context, "wrong number of arguments to function symlink()", -1); + return; + } + + const char* src = (const char*)sqlite3_value_text(argv[0]); + if (src == 0) { + return; + } + const char* dst = (const char*)sqlite3_value_text(argv[1]); + + int res = createSymlink(context, src, dst); + if (res != 0) { + ctxErrorMsg(context, "failed to create symlink to: %s", src); + } +} + +// Creates a directory. 
+// mkdir(path, perm) +static void fileio_mkdir(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc != 1 && argc != 2) { + sqlite3_result_error(context, "wrong number of arguments to function mkdir()", -1); + return; + } + + const char* path = (const char*)sqlite3_value_text(argv[0]); + if (path == 0) { + return; + } + + mode_t perm = 0777; + if (argc == 2) { + perm = (mode_t)sqlite3_value_int(argv[1]); + } + + int res = makeDirectory(context, path, perm); + + if (res != 0) { + ctxErrorMsg(context, "failed to create directory: %s", path); + } +} + +// Given a numberic st_mode from stat(), convert it into a human-readable +// text string in the style of "ls -l". +// lsmode(mode) +static void fileio_lsmode(sqlite3_context* context, int argc, sqlite3_value** argv) { + int i; + int iMode = sqlite3_value_int(argv[0]); + char z[16]; + (void)argc; + if (S_ISLNK(iMode)) { + z[0] = 'l'; + } else if (S_ISREG(iMode)) { + z[0] = '-'; + } else if (S_ISDIR(iMode)) { + z[0] = 'd'; + } else { + z[0] = '?'; + } + for (i = 0; i < 3; i++) { + int m = (iMode >> ((2 - i) * 3)); + char* a = &z[1 + i * 3]; + a[0] = (m & 0x4) ? 'r' : '-'; + a[1] = (m & 0x2) ? 'w' : '-'; + a[2] = (m & 0x1) ? 'x' : '-'; + } + z[10] = '\0'; + sqlite3_result_text(context, z, -1, SQLITE_TRANSIENT); +} + +/* +** Cursor type for recursively iterating through a directory structure. 
+*/ +typedef struct fsdir_cursor fsdir_cursor; +typedef struct FsdirLevel FsdirLevel; + +struct FsdirLevel { + DIR* pDir; /* From opendir() */ + char* zDir; /* Name of directory (nul-terminated) */ +}; + +struct fsdir_cursor { + sqlite3_vtab_cursor base; /* Base class - must be first */ + + bool recursive; /* true to traverse dirs recursively, false otherwise */ + + int nLvl; /* Number of entries in aLvl[] array */ + int iLvl; /* Index of current entry */ + FsdirLevel* aLvl; /* Hierarchy of directories being traversed */ + + struct stat sStat; /* Current lstat() results */ + char* zPath; /* Path to current entry */ + sqlite3_int64 iRowid; /* Current rowid */ +}; + +typedef struct fsdir_tab fsdir_tab; +struct fsdir_tab { + sqlite3_vtab base; /* Base class - must be first */ +}; + +/* +** Construct a new fsdir virtual table object. +*/ +static int fsdirConnect(sqlite3* db, + void* pAux, + int argc, + const char* const* argv, + sqlite3_vtab** ppVtab, + char** pzErr) { + fsdir_tab* pNew = 0; + int rc; + (void)pAux; + (void)argc; + (void)argv; + (void)pzErr; + rc = sqlite3_declare_vtab(db, "CREATE TABLE x" FSDIR_SCHEMA); + if (rc == SQLITE_OK) { + pNew = (fsdir_tab*)sqlite3_malloc(sizeof(*pNew)); + if (pNew == 0) + return SQLITE_NOMEM; + memset(pNew, 0, sizeof(*pNew)); + sqlite3_vtab_config(db, SQLITE_VTAB_DIRECTONLY); + } + *ppVtab = (sqlite3_vtab*)pNew; + return rc; +} + +/* +** This method is the destructor for fsdir vtab objects. +*/ +static int fsdirDisconnect(sqlite3_vtab* pVtab) { + sqlite3_free(pVtab); + return SQLITE_OK; +} + +/* +** Constructor for a new fsdir_cursor object. +*/ +static int fsdirOpen(sqlite3_vtab* p, sqlite3_vtab_cursor** ppCursor) { + fsdir_cursor* pCur; + (void)p; + pCur = sqlite3_malloc(sizeof(*pCur)); + if (pCur == 0) + return SQLITE_NOMEM; + memset(pCur, 0, sizeof(*pCur)); + pCur->iLvl = -1; + *ppCursor = &pCur->base; + return SQLITE_OK; +} + +/* +** Reset a cursor back to the state it was in when first returned +** by fsdirOpen(). 
+*/ +static void fsdirResetCursor(fsdir_cursor* pCur) { + int i; + for (i = 0; i <= pCur->iLvl; i++) { + FsdirLevel* pLvl = &pCur->aLvl[i]; + if (pLvl->pDir) + closedir(pLvl->pDir); + sqlite3_free(pLvl->zDir); + } + sqlite3_free(pCur->zPath); + sqlite3_free(pCur->aLvl); + pCur->aLvl = 0; + pCur->zPath = 0; + pCur->nLvl = 0; + pCur->iLvl = -1; + pCur->iRowid = 1; +} + +/* +** Destructor for an fsdir_cursor. +*/ +static int fsdirClose(sqlite3_vtab_cursor* cur) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + + fsdirResetCursor(pCur); + sqlite3_free(pCur); + return SQLITE_OK; +} + +/* +** Set the error message for the virtual table associated with cursor +** pCur to the results of vprintf(zFmt, ...). +*/ +static void fsdirSetErrmsg(fsdir_cursor* pCur, const char* zFmt, ...) { + va_list ap; + va_start(ap, zFmt); + pCur->base.pVtab->zErrMsg = sqlite3_vmprintf(zFmt, ap); + va_end(ap); +} + +/* +** Advance an fsdir_cursor to its next row of output. +*/ +static int fsdirNext(sqlite3_vtab_cursor* cur) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + mode_t m = pCur->sStat.st_mode; + + pCur->iRowid++; + if (S_ISDIR(m) && (pCur->iLvl == -1 || pCur->recursive)) { + /* Descend into this directory */ + int iNew = pCur->iLvl + 1; + FsdirLevel* pLvl; + if (iNew >= pCur->nLvl) { + int nNew = iNew + 1; + sqlite3_int64 nByte = nNew * sizeof(FsdirLevel); + FsdirLevel* aNew = (FsdirLevel*)sqlite3_realloc64(pCur->aLvl, nByte); + if (aNew == 0) + return SQLITE_NOMEM; + memset(&aNew[pCur->nLvl], 0, sizeof(FsdirLevel) * (nNew - pCur->nLvl)); + pCur->aLvl = aNew; + pCur->nLvl = nNew; + } + pCur->iLvl = iNew; + pLvl = &pCur->aLvl[iNew]; + + pLvl->zDir = pCur->zPath; + pCur->zPath = 0; + pLvl->pDir = opendir(pLvl->zDir); + if (pLvl->pDir == 0) { + fsdirSetErrmsg(pCur, "cannot read directory: %s", pCur->zPath); + return SQLITE_ERROR; + } + } + + while (pCur->iLvl >= 0) { + FsdirLevel* pLvl = &pCur->aLvl[pCur->iLvl]; + struct dirent* pEntry = readdir(pLvl->pDir); + if (pEntry) { + if 
(pEntry->d_name[0] == '.') { + if (pEntry->d_name[1] == '.' && pEntry->d_name[2] == '\0') + continue; + if (pEntry->d_name[1] == '\0') + continue; + } + sqlite3_free(pCur->zPath); + pCur->zPath = sqlite3_mprintf("%s/%s", pLvl->zDir, pEntry->d_name); + if (pCur->zPath == 0) + return SQLITE_NOMEM; + if (fileLinkStat(pCur->zPath, &pCur->sStat)) { + fsdirSetErrmsg(pCur, "cannot stat file: %s", pCur->zPath); + return SQLITE_ERROR; + } + return SQLITE_OK; + } + closedir(pLvl->pDir); + sqlite3_free(pLvl->zDir); + pLvl->pDir = 0; + pLvl->zDir = 0; + pCur->iLvl--; + } + + /* EOF */ + sqlite3_free(pCur->zPath); + pCur->zPath = 0; + return SQLITE_OK; +} + +/* +** Return values of columns for the row at which the series_cursor +** is currently pointing. +*/ +static int fsdirColumn(sqlite3_vtab_cursor* cur, /* The cursor */ + sqlite3_context* ctx, /* First argument to sqlite3_result_...() */ + int i /* Which column to return */ +) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + switch (i) { + case FSDIR_COLUMN_NAME: { + sqlite3_result_text(ctx, pCur->zPath, -1, SQLITE_TRANSIENT); + break; + } + + case FSDIR_COLUMN_MODE: + sqlite3_result_int64(ctx, pCur->sStat.st_mode); + break; + + case FSDIR_COLUMN_MTIME: + sqlite3_result_int64(ctx, pCur->sStat.st_mtime); + break; + + case FSDIR_COLUMN_SIZE: { + sqlite3_result_int64(ctx, pCur->sStat.st_size); + break; + } + case FSDIR_COLUMN_PATH: + default: { + /* The FSDIR_COLUMN_PATH and FSDIR_COLUMN_REC are input parameters. + ** always return their values as NULL */ + break; + } + } + return SQLITE_OK; +} + +/* +** Return the rowid for the current row. In this implementation, the +** first row returned is assigned rowid value 1, and each subsequent +** row a value 1 more than that of the previous. 
+*/ +static int fsdirRowid(sqlite3_vtab_cursor* cur, sqlite_int64* pRowid) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + *pRowid = pCur->iRowid; + return SQLITE_OK; +} + +/* +** Return TRUE if the cursor has been moved off of the last +** row of output. +*/ +static int fsdirEof(sqlite3_vtab_cursor* cur) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + return (pCur->zPath == 0); +} + +/* +** xFilter callback. +** +** idxNum==0 PATH was not supplied (invalid function call) +** idxNum==1 PATH was supplied +*/ +static int fsdirFilter(sqlite3_vtab_cursor* cur, + int idxNum, + const char* idxStr, + int argc, + sqlite3_value** argv) { + fsdir_cursor* pCur = (fsdir_cursor*)cur; + (void)idxStr; + fsdirResetCursor(pCur); + + if (idxNum == 0) { + fsdirSetErrmsg(pCur, "table function lsdir requires an argument"); + return SQLITE_ERROR; + } + + assert(idxNum == 1 && (argc == 1 || argc == 2)); + const char* zPath = (const char*)sqlite3_value_text(argv[0]); + if (zPath == 0) { + fsdirSetErrmsg(pCur, "table function lsdir requires a non-NULL argument"); + return SQLITE_ERROR; + } + pCur->zPath = sqlite3_mprintf("%s", zPath); + + bool recursive = false; + if (argc == 2) { + recursive = (bool)sqlite3_value_int(argv[1]); + } + pCur->recursive = recursive; + + if (pCur->zPath == 0) { + return SQLITE_NOMEM; + } + if (fileLinkStat(pCur->zPath, &pCur->sStat)) { + // file does not exist, terminate via subsequent call to fsdirEof + pCur->zPath = 0; + } + + return SQLITE_OK; +} + +/* +** SQLite will invoke this method one or more times while planning a query +** that uses the generate_series virtual table. This routine needs to create +** a query plan for each invocation and compute an estimated cost for that +** plan. +** +** In this implementation idxNum is used to represent the +** query plan. idxStr is unused. 
+** +** The query plan is represented by values of idxNum: +** +** (1) The path value is supplied by argv[0] +*/ +static int fsdirBestIndex(sqlite3_vtab* tab, sqlite3_index_info* pIdxInfo) { + int i; /* Loop over constraints */ + int idxPath = -1; /* Index in pIdxInfo->aConstraint of PATH= */ + int idxRec = -1; /* Index in pIdxInfo->aConstraint of REC= */ + int seenPath = 0; /* True if an unusable PATH= constraint is seen */ + int seenRec = 0; /* True if an unusable REC= constraint is seen */ + const struct sqlite3_index_constraint* pConstraint; + + (void)tab; + pConstraint = pIdxInfo->aConstraint; + for (i = 0; i < pIdxInfo->nConstraint; i++, pConstraint++) { + if (pConstraint->op != SQLITE_INDEX_CONSTRAINT_EQ) + continue; + switch (pConstraint->iColumn) { + case FSDIR_COLUMN_PATH: { + if (pConstraint->usable) { + idxPath = i; + seenPath = 0; + } else if (idxPath < 0) { + seenPath = 1; + } + break; + } + case FSDIR_COLUMN_REC: { + if (pConstraint->usable) { + idxRec = i; + seenRec = 0; + } else if (idxRec < 0) { + seenRec = 1; + } + break; + } + } + } + if (seenPath || seenRec) { + /* If input parameters are unusable, disallow this plan */ + return SQLITE_CONSTRAINT; + } + + if (idxPath < 0) { + pIdxInfo->idxNum = 0; + /* The pIdxInfo->estimatedCost should have been initialized to a huge + ** number. Leave it unchanged. 
*/ + pIdxInfo->estimatedRows = 0x7fffffff; + } else { + pIdxInfo->aConstraintUsage[idxPath].omit = 1; + pIdxInfo->aConstraintUsage[idxPath].argvIndex = 1; + if (idxRec >= 0) { + pIdxInfo->aConstraintUsage[idxRec].omit = 1; + pIdxInfo->aConstraintUsage[idxRec].argvIndex = 2; + } + pIdxInfo->idxNum = 1; + pIdxInfo->estimatedCost = 100.0; + } + + return SQLITE_OK; +} + +static sqlite3_module ls_module = { + .xConnect = fsdirConnect, + .xBestIndex = fsdirBestIndex, + .xDisconnect = fsdirDisconnect, + .xOpen = fsdirOpen, + .xClose = fsdirClose, + .xFilter = fsdirFilter, + .xNext = fsdirNext, + .xEof = fsdirEof, + .xColumn = fsdirColumn, + .xRowid = fsdirRowid, +}; + +int fileio_ls_init(sqlite3* db) { + sqlite3_create_module(db, "fileio_ls", &ls_module, 0); + sqlite3_create_module(db, "lsdir", &ls_module, 0); + return SQLITE_OK; +} + +int fileio_scalar_init(sqlite3* db) { + static const int flags = SQLITE_UTF8 | SQLITE_DIRECTONLY; + sqlite3_create_function(db, "fileio_mode", 1, SQLITE_UTF8, 0, fileio_lsmode, 0, 0); + sqlite3_create_function(db, "lsmode", 1, SQLITE_UTF8, 0, fileio_lsmode, 0, 0); + + sqlite3_create_function(db, "fileio_mkdir", -1, flags, 0, fileio_mkdir, 0, 0); + sqlite3_create_function(db, "mkdir", -1, flags, 0, fileio_mkdir, 0, 0); + + sqlite3_create_function(db, "fileio_read", -1, flags, 0, fileio_readfile, 0, 0); + sqlite3_create_function(db, "readfile", -1, flags, 0, fileio_readfile, 0, 0); + + sqlite3_create_function(db, "fileio_symlink", 2, flags, 0, fileio_symlink, 0, 0); + sqlite3_create_function(db, "symlink", 2, flags, 0, fileio_symlink, 0, 0); + + sqlite3_create_function(db, "fileio_write", -1, flags, 0, fileio_writefile, 0, 0); + sqlite3_create_function(db, "writefile", -1, flags, 0, fileio_writefile, 0, 0); + + sqlite3_create_function(db, "fileio_append", 2, flags, 0, fileio_append, 0, 0); + return SQLITE_OK; +} + +#if defined(FILEIO_WIN32_DLL) && (defined(_WIN32) || defined(WIN32)) +/* To allow a standalone DLL, make test_windirent.c use the 
same + * redefined SQLite API calls as the above extension code does. + * Just pull in this .c to accomplish this. As a beneficial side + * effect, this extension becomes a single translation unit. */ + +/* +** Implementation of the POSIX getenv() function using the Win32 API. +** This function is not thread-safe. +*/ +const char* windirent_getenv(const char* name) { + static char value[32768]; /* Maximum length, per MSDN */ + DWORD dwSize = sizeof(value) / sizeof(char); /* Size in chars */ + DWORD dwRet; /* Value returned by GetEnvironmentVariableA() */ + + memset(value, 0, sizeof(value)); + dwRet = GetEnvironmentVariableA(name, value, dwSize); + if (dwRet == 0 || dwRet > dwSize) { + /* + ** The function call to GetEnvironmentVariableA() failed -OR- + ** the buffer is not large enough. Either way, return NULL. + */ + return 0; + } else { + /* + ** The function call to GetEnvironmentVariableA() succeeded + ** -AND- the buffer contains the entire value. + */ + return value; + } +} + +/* +** Implementation of the POSIX opendir() function using the MSVCRT. +*/ +LPDIR opendir(const char* dirname) { + struct _finddata_t data; + LPDIR dirp = (LPDIR)sqlite3_malloc(sizeof(DIR)); + SIZE_T namesize = sizeof(data.name) / sizeof(data.name[0]); + + if (dirp == NULL) + return NULL; + memset(dirp, 0, sizeof(DIR)); + + /* TODO: Remove this if Unix-style root paths are not used. */ + if (sqlite3_stricmp(dirname, "/") == 0) { + dirname = windirent_getenv("SystemDrive"); + } + + memset(&data, 0, sizeof(struct _finddata_t)); + _snprintf(data.name, namesize, "%s\\*", dirname); + dirp->d_handle = _findfirst(data.name, &data); + + if (dirp->d_handle == BAD_INTPTR_T) { + closedir(dirp); + return NULL; + } + + /* TODO: Remove this block to allow hidden and/or system files. 
*/ + if (is_filtered(data)) { + next: + + memset(&data, 0, sizeof(struct _finddata_t)); + if (_findnext(dirp->d_handle, &data) == -1) { + closedir(dirp); + return NULL; + } + + /* TODO: Remove this block to allow hidden and/or system files. */ + if (is_filtered(data)) + goto next; + } + + dirp->d_first.d_attributes = data.attrib; + strncpy(dirp->d_first.d_name, data.name, NAME_MAX); + dirp->d_first.d_name[NAME_MAX] = '\0'; + + return dirp; +} + +/* +** Implementation of the POSIX readdir() function using the MSVCRT. +*/ +LPDIRENT readdir(LPDIR dirp) { + struct _finddata_t data; + + if (dirp == NULL) + return NULL; + + if (dirp->d_first.d_ino == 0) { + dirp->d_first.d_ino++; + dirp->d_next.d_ino++; + + return &dirp->d_first; + } + +next: + + memset(&data, 0, sizeof(struct _finddata_t)); + if (_findnext(dirp->d_handle, &data) == -1) + return NULL; + + /* TODO: Remove this block to allow hidden and/or system files. */ + if (is_filtered(data)) + goto next; + + dirp->d_next.d_ino++; + dirp->d_next.d_attributes = data.attrib; + strncpy(dirp->d_next.d_name, data.name, NAME_MAX); + dirp->d_next.d_name[NAME_MAX] = '\0'; + + return &dirp->d_next; +} + +/* +** Implementation of the POSIX readdir_r() function using the MSVCRT. +*/ +INT readdir_r(LPDIR dirp, LPDIRENT entry, LPDIRENT* result) { + struct _finddata_t data; + + if (dirp == NULL) + return EBADF; + + if (dirp->d_first.d_ino == 0) { + dirp->d_first.d_ino++; + dirp->d_next.d_ino++; + + entry->d_ino = dirp->d_first.d_ino; + entry->d_attributes = dirp->d_first.d_attributes; + strncpy(entry->d_name, dirp->d_first.d_name, NAME_MAX); + entry->d_name[NAME_MAX] = '\0'; + + *result = entry; + return 0; + } + +next: + + memset(&data, 0, sizeof(struct _finddata_t)); + if (_findnext(dirp->d_handle, &data) == -1) { + *result = NULL; + return ENOENT; + } + + /* TODO: Remove this block to allow hidden and/or system files. 
*/ + if (is_filtered(data)) + goto next; + + entry->d_ino = (ino_t)-1; /* not available */ + entry->d_attributes = data.attrib; + strncpy(entry->d_name, data.name, NAME_MAX); + entry->d_name[NAME_MAX] = '\0'; + + *result = entry; + return 0; +} + +/* +** Implementation of the POSIX closedir() function using the MSVCRT. +*/ +INT closedir(LPDIR dirp) { + INT result = 0; + + if (dirp == NULL) + return EINVAL; + + if (dirp->d_handle != NULL_INTPTR_T && dirp->d_handle != BAD_INTPTR_T) { + result = _findclose(dirp->d_handle); + } + + sqlite3_free(dirp); + return result; +} +#endif diff --git a/libsql-ffi/bundled/sqlean/fileio/scan.c b/libsql-ffi/bundled/sqlean/fileio/scan.c new file mode 100644 index 0000000000..c57c7ec1c3 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fileio/scan.c @@ -0,0 +1,300 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// scanfile(name) +// Reads a file with the specified name line by line. +// Implemented as a table-valued function. + +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#else +#include +#endif + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +/* + * readline reads chars from the input `stream` until it encounters \n char. + * Returns the number or characters read. + * + * `lineptr` points to the first character read. + * `n` equals the current buffer size. 
+ */ +static ssize_t readline(char** lineptr, size_t* n, FILE* stream) { + char* bufptr = NULL; + char* p = bufptr; + size_t size; + int c; + + if (lineptr == NULL) { + return -1; + } + if (stream == NULL) { + return -1; + } + if (n == NULL) { + return -1; + } + bufptr = *lineptr; + size = *n; + + c = fgetc(stream); + if (c == EOF) { + return -1; + } + if (bufptr == NULL) { + bufptr = malloc(128); + if (bufptr == NULL) { + return -1; + } + size = 128; + } + p = bufptr; + while (c != EOF) { + if ((ssize_t)(p - bufptr) > (ssize_t)(size - 1)) { + size = size + 128; + bufptr = realloc(bufptr, size); + if (bufptr == NULL) { + return -1; + } + } + *p++ = c; + if (c == '\n') { + break; + } + c = fgetc(stream); + } + + *p++ = '\0'; + *lineptr = bufptr; + *n = size; + + return p - bufptr - 1; +} + +typedef struct { + sqlite3_vtab base; +} Table; + +typedef struct { + sqlite3_vtab_cursor base; + const char* name; + FILE* in; + bool eof; + char* line; + sqlite3_int64 rowid; +} Cursor; + +#define COLUMN_ROWID -1 +#define COLUMN_VALUE 0 +#define COLUMN_NAME 1 + +// xconnect creates the virtual table. +static int xconnect(sqlite3* db, + void* aux, + int argc, + const char* const* argv, + sqlite3_vtab** vtabptr, + char** errptr) { + (void)aux; + (void)argc; + (void)argv; + (void)errptr; + + int rc = sqlite3_declare_vtab(db, "CREATE TABLE x(value text, name hidden)"); + if (rc != SQLITE_OK) { + return rc; + } + + Table* table = sqlite3_malloc(sizeof(*table)); + *vtabptr = (sqlite3_vtab*)table; + if (table == NULL) { + return SQLITE_NOMEM; + } + memset(table, 0, sizeof(*table)); + sqlite3_vtab_config(db, SQLITE_VTAB_DIRECTONLY); + return SQLITE_OK; +} + +// xdisconnect destroys the virtual table. +static int xdisconnect(sqlite3_vtab* vtable) { + Table* table = (Table*)vtable; + sqlite3_free(table); + return SQLITE_OK; +} + +// xopen creates a new cursor. 
+static int xopen(sqlite3_vtab* vtable, sqlite3_vtab_cursor** curptr) { + (void)vtable; + Cursor* cursor = sqlite3_malloc(sizeof(*cursor)); + if (cursor == NULL) { + return SQLITE_NOMEM; + } + memset(cursor, 0, sizeof(*cursor)); + *curptr = &cursor->base; + return SQLITE_OK; +} + +// xclose destroys the cursor. +static int xclose(sqlite3_vtab_cursor* cur) { + Cursor* cursor = (Cursor*)cur; + if (cursor->in != NULL) { + fclose(cursor->in); + } + if (cursor->line != NULL) { + free(cursor->line); + } + sqlite3_free(cur); + return SQLITE_OK; +} + +// xnext advances the cursor to its next row of output. +static int xnext(sqlite3_vtab_cursor* cur) { + Cursor* cursor = (Cursor*)cur; + cursor->rowid++; + size_t bufsize = 0; + ssize_t len = readline(&cursor->line, &bufsize, cursor->in); + if (len == -1) { + cursor->eof = true; + } + if (len >= 1 && cursor->line[len - 1] == '\n') { + cursor->line[len - 1] = '\0'; + } + if (len >= 2 && cursor->line[len - 2] == '\r') { + cursor->line[len - 2] = '\0'; + } + return SQLITE_OK; +} + +// xcolumn returns the current cursor value. +static int xcolumn(sqlite3_vtab_cursor* cur, sqlite3_context* ctx, int col_idx) { + (void)col_idx; + Cursor* cursor = (Cursor*)cur; + switch (col_idx) { + case COLUMN_VALUE: + sqlite3_result_text(ctx, (const char*)cursor->line, -1, SQLITE_TRANSIENT); + break; + + case COLUMN_NAME: + sqlite3_result_text(ctx, cursor->name, -1, SQLITE_TRANSIENT); + break; + + default: + break; + } + return SQLITE_OK; +} + +// xrowid returns the rowid for the current row. +static int xrowid(sqlite3_vtab_cursor* cur, sqlite_int64* rowid_ptr) { + Cursor* cursor = (Cursor*)cur; + *rowid_ptr = cursor->rowid; + return SQLITE_OK; +} + +// xeof returns TRUE if the cursor has been moved off of the last row of output. +static int xeof(sqlite3_vtab_cursor* cur) { + Cursor* cursor = (Cursor*)cur; + return cursor->eof; +} + +// xfilter rewinds the cursor back to the first row of output. 
+static int xfilter(sqlite3_vtab_cursor* cur, + int idx_num, + const char* idx_str, + int argc, + sqlite3_value** argv) { + (void)idx_num; + (void)idx_str; + + if (argc != 1) { + return SQLITE_ERROR; + } + const char* name = (const char*)sqlite3_value_text(argv[0]); + + Cursor* cursor = (Cursor*)cur; + sqlite3_vtab* vtable = (cursor->base).pVtab; + + // free resources from the previous file, if any + if (cursor->in != NULL) { + fclose(cursor->in); + } + if (cursor->line != NULL) { + free(cursor->line); + } + + // reset the cursor + cursor->name = name; + cursor->eof = false; + cursor->line = NULL; + cursor->rowid = 0; + + cursor->in = fopen(cursor->name, "r"); + if (cursor->in == NULL) { + vtable->zErrMsg = sqlite3_mprintf("cannot open '%s' for reading", cursor->name); + return SQLITE_ERROR; + } + + return xnext(cur); +} + +// xbest_index instructs SQLite to pass certain arguments to xFilter. +static int xbest_index(sqlite3_vtab* vtable, sqlite3_index_info* index_info) { + // for (size_t i = 0; i < index_info->nConstraint; i++) { + // const struct sqlite3_index_constraint* constraint = index_info->aConstraint + i; + // printf("i=%zu iColumn=%d, op=%d, usable=%d\n", i, constraint->iColumn, constraint->op, + // constraint->usable); + // } + + // only the name argument is supported + if (index_info->nConstraint != 1) { + vtable->zErrMsg = sqlite3_mprintf("scanfile() expects a single constraint (name)"); + return SQLITE_ERROR; + } + + const struct sqlite3_index_constraint* constraint = index_info->aConstraint; + if (constraint->iColumn != COLUMN_NAME) { + vtable->zErrMsg = sqlite3_mprintf("scanfile() expects a name constraint)"); + return SQLITE_ERROR; + } + + if (constraint->usable == 0) { + // unusable contraint + return SQLITE_CONSTRAINT; + } + + // pass the name argument to xFilter + index_info->aConstraintUsage[0].argvIndex = COLUMN_NAME; + index_info->aConstraintUsage[0].omit = 1; + index_info->estimatedCost = (double)1000; + index_info->estimatedRows = 1000; + 
return SQLITE_OK; +} + +static sqlite3_module scan_module = { + .xConnect = xconnect, + .xBestIndex = xbest_index, + .xDisconnect = xdisconnect, + .xOpen = xopen, + .xClose = xclose, + .xFilter = xfilter, + .xNext = xnext, + .xEof = xeof, + .xColumn = xcolumn, + .xRowid = xrowid, +}; + +int fileio_scan_init(sqlite3* db) { + sqlite3_create_module(db, "fileio_scan", &scan_module, 0); + sqlite3_create_module(db, "scanfile", &scan_module, 0); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/fuzzy/caver.c b/libsql-ffi/bundled/sqlean/fuzzy/caver.c new file mode 100644 index 0000000000..b6edfaa2f2 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fuzzy/caver.c @@ -0,0 +1,323 @@ +// Copyright (c) 2021 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Caverphone phonetic coding algorithm. +// https://en.wikipedia.org/wiki/Caverphone + +#include +#include +#include + +// remove_non_letters deletes everything from the source string, +// except lowercased letters a-z +static char* remove_non_letters(const char* src) { + size_t src_len = strlen(src); + char* res = malloc((src_len + 1) * sizeof(char)); + const char* src_it; + char* res_it = res; + for (size_t idx = 0; idx < src_len; idx++) { + src_it = src + idx; + if (*src_it < 97 || *src_it > 122) { + continue; + } + *res_it = *src_it; + res_it++; + } + *res_it = '\0'; + return res; +} + +// replace_start replaces the `old` substring with the `new` one +// if it matches at the beginning of the `src` string +static char* replace_start(const char* src, const char* old, const char* new) { + size_t src_len = strlen(src); + size_t old_len = strlen(old); + size_t new_len = strlen(new); + assert(new_len <= old_len); + + char* res = malloc((src_len + 1) * sizeof(char)); + + if (src_len < old_len) { + // source string is shorter than the substring to replace, + // so there is definitely no match + strcpy(res, src); + return res; + } + + if (strncmp(src, old, old_len) == 0) { + strncpy(res, new, new_len); 
+ strncpy(res + new_len, src + old_len, src_len - old_len); + *(res + src_len - old_len + new_len) = '\0'; + } else { + strcpy(res, src); + } + return res; +} + +// replace_end replaces the `old` substring with the `new` one +// if it matches at the end of the `src` string +static char* replace_end(const char* src, const char* old, const char* new) { + size_t src_len = strlen(src); + size_t old_len = strlen(old); + size_t new_len = strlen(new); + assert(new_len <= old_len); + + char* res = malloc((src_len + 1) * sizeof(char)); + + if (src_len < old_len) { + // source string is shorter than the substring to replace, + // so there is definitely no match + strcpy(res, src); + return res; + } + + strncpy(res, src, src_len - old_len); + if (strncmp(src + src_len - old_len, old, old_len) == 0) { + strncpy(res + src_len - old_len, new, new_len); + *(res + src_len - old_len + new_len) = '\0'; + } else { + strncpy(res + src_len - old_len, src + src_len - old_len, old_len); + *(res + src_len) = '\0'; + } + return res; +} + +// replace replaces all `old` substrings with `new` ones +// in the the `src` string +static char* replace(const char* src, const char* old, const char* new) { + size_t src_len = strlen(src); + size_t old_len = strlen(old); + size_t new_len = strlen(new); + assert(new_len <= old_len); + + char* res = malloc((src_len + 1) * sizeof(char)); + + if (src_len < old_len) { + // source string is shorter than the substring to replace, + // so there is definitely no match + strcpy(res, src); + return res; + } + + const char* src_it; + char* res_it = res; + for (size_t idx = 0; idx < src_len;) { + src_it = src + idx; + if (strncmp(src_it, old, old_len) == 0) { + strncpy(res_it, new, new_len); + res_it += new_len; + idx += old_len; + } else { + *res_it = *src_it; + res_it++; + idx++; + } + } + *res_it = '\0'; + return res; +} + +// replace_seq replaces all sequences of the `old` character +// with the `new` substring in the the `src` string +static char* 
replace_seq(const char* src, const char old, const char* new) { + size_t src_len = strlen(src); + size_t new_len = strlen(new); + char* res = malloc((src_len + 1) * sizeof(char)); + const char* src_it; + char* res_it = res; + size_t match_len = 0; + for (size_t idx = 0; idx < src_len;) { + src_it = src + idx; + if (*src_it == old) { + match_len++; + idx++; + } else { + if (match_len > 0) { + strncpy(res_it, new, new_len); + res_it += new_len; + match_len = 0; + } + *res_it = *src_it; + res_it++; + idx++; + } + } + if (match_len > 0) { + strncpy(res_it, new, new_len); + res_it += new_len; + } + *res_it = '\0'; + return res; +} + +// pad pads `src` string with trailing 1s +// up to the length of 10 characters +static char* pad(const char* src) { + size_t src_len = strlen(src); + size_t max_len = 10; + + char* res = malloc((max_len + 1) * sizeof(char)); + strncpy(res, src, max_len); + if (src_len < max_len) { + for (size_t idx = src_len; idx < max_len; idx++) { + *(res + idx) = '1'; + } + } + *(res + max_len) = '\0'; + return res; +} + +// step frees the source string and returns the result one +static char* step(char* res, char* src) { + free(src); + return res; +} + +// caverphone implements the Caverphone phonetic hashing algorithm +// as described in https://caversham.otago.ac.nz/files/working/ctp150804.pdf +char* caverphone(const char* src) { + assert(src != NULL); + + char* res = malloc((strlen(src) + 1) * sizeof(char)); + + if (src == 0 || *src == '\0') { + res[0] = '\0'; + return res; + } + + strcpy(res, src); + + // Remove anything not in the standard alphabet + res = step(remove_non_letters((const char*)res), res); + + // Remove final e + res = step(replace_end((const char*)res, "e", ""), res); + + // If the name starts with *gh make it *2f + res = step(replace_start((const char*)res, "cough", "cou2f"), res); + res = step(replace_start((const char*)res, "rough", "rou2f"), res); + res = step(replace_start((const char*)res, "tough", "tou2f"), res); + res = 
step(replace_start((const char*)res, "enough", "enou2f"), res); + res = step(replace_start((const char*)res, "trough", "trou2f"), res); + + // If the name starts with gn make it 2n + res = step(replace_start((const char*)res, "gn", "2n"), res); + // If the name ends with mb make it m2 + res = step(replace_end((const char*)res, "mb", "m2"), res); + // replace cq with 2q + res = step(replace((const char*)res, "cq", "2q"), res); + + // replace c[iey] with s[iey] + res = step(replace((const char*)res, "ci", "si"), res); + res = step(replace((const char*)res, "ce", "se"), res); + res = step(replace((const char*)res, "cy", "sy"), res); + + // replace tch with 2ch + res = step(replace((const char*)res, "tch", "2ch"), res); + + // replace [cqx] with k + res = step(replace((const char*)res, "c", "k"), res); + res = step(replace((const char*)res, "q", "k"), res); + res = step(replace((const char*)res, "x", "k"), res); + + // replace v with f + res = step(replace((const char*)res, "v", "f"), res); + // replace dg with 2g + res = step(replace((const char*)res, "dg", "2g"), res); + + // replace ti[oa] with si[oa] + res = step(replace((const char*)res, "tio", "sio"), res); + res = step(replace((const char*)res, "tia", "sia"), res); + + // replace d with t + res = step(replace((const char*)res, "d", "t"), res); + // replace ph with fh + res = step(replace((const char*)res, "ph", "fh"), res); + // replace b with p + res = step(replace((const char*)res, "b", "p"), res); + // replace sh with s2 + res = step(replace((const char*)res, "sh", "s2"), res); + // replace z with s + res = step(replace((const char*)res, "z", "s"), res); + + // replace an initial vowel [aeiou] with an A + res = step(replace_start((const char*)res, "a", "A"), res); + res = step(replace_start((const char*)res, "e", "A"), res); + res = step(replace_start((const char*)res, "i", "A"), res); + res = step(replace_start((const char*)res, "o", "A"), res); + res = step(replace_start((const char*)res, "u", "A"), res); + 
+ // replace all other vowels with a 3 + res = step(replace((const char*)res, "a", "3"), res); + res = step(replace((const char*)res, "e", "3"), res); + res = step(replace((const char*)res, "i", "3"), res); + res = step(replace((const char*)res, "o", "3"), res); + res = step(replace((const char*)res, "u", "3"), res); + + // replace j with y + res = step(replace((const char*)res, "j", "y"), res); + + // replace an initial y3 with Y3 + res = step(replace_start((const char*)res, "y3", "Y3"), res); + // replace an initial y with A + res = step(replace_start((const char*)res, "y", "A"), res); + // replace y with 3 + res = step(replace((const char*)res, "y", "3"), res); + + // replace 3gh3 with 3kh3 + res = step(replace((const char*)res, "3gh3", "3kh3"), res); + // replace gh with 22 + res = step(replace((const char*)res, "gh", "22"), res); + // replace g with k + res = step(replace((const char*)res, "g", "k"), res); + + // replace sequence of the letter [stpkfmn] with an uppercased letter + res = step(replace_seq((const char*)res, 's', "S"), res); + res = step(replace_seq((const char*)res, 't', "T"), res); + res = step(replace_seq((const char*)res, 'p', "P"), res); + res = step(replace_seq((const char*)res, 'k', "K"), res); + res = step(replace_seq((const char*)res, 'f', "F"), res); + res = step(replace_seq((const char*)res, 'm', "M"), res); + res = step(replace_seq((const char*)res, 'n', "N"), res); + + // replace w3 with W3 + res = step(replace((const char*)res, "w3", "W3"), res); + // replace wh3 with Wh3 + res = step(replace((const char*)res, "wh3", "Wh3"), res); + // replace the final w with 3 + res = step(replace_end((const char*)res, "w", "3"), res); + // replace w with 2 + res = step(replace((const char*)res, "w", "2"), res); + + // replace an initial h with an A + res = step(replace_start((const char*)res, "h", "A"), res); + // replace all other occurrences of h with a 2 + res = step(replace((const char*)res, "h", "2"), res); + + // replace r3 with R3 + res = 
step(replace((const char*)res, "r3", "R3"), res); + // replace the final r with 3 + res = step(replace_end((const char*)res, "r", "3"), res); + // replace r with 2 + res = step(replace((const char*)res, "r", "2"), res); + + // replace l3 with L3 + res = step(replace((const char*)res, "l3", "L3"), res); + // replace the final l with 3 + res = step(replace_end((const char*)res, "l", "3"), res); + // replace l with 2 + res = step(replace((const char*)res, "l", "2"), res); + + // remove all 2s + res = step(replace((const char*)res, "2", ""), res); + // replace the final 3 with A + res = step(replace_end((const char*)res, "3", "A"), res); + // remove all 3s + res = step(replace((const char*)res, "3", ""), res); + + // put ten 1s on the end + // take the first ten characters as the code + res = step(pad((const char*)res), res); + + return res; +} diff --git a/libsql-ffi/bundled/sqlean/fuzzy/common.c b/libsql-ffi/bundled/sqlean/fuzzy/common.c new file mode 100644 index 0000000000..4beca4403b --- /dev/null +++ b/libsql-ffi/bundled/sqlean/fuzzy/common.c @@ -0,0 +1,112 @@ +// Originally from the spellfix SQLite exension, Public Domain +// https://www.sqlite.org/src/file/ext/misc/spellfix.c +// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License + +#include "fuzzy/common.h" + +/* +** The following table gives the character class for non-initial ASCII +** characters. 
+*/
+/* 128 entries, one per 7-bit ASCII code. characterClass() in fuzzy/editdist.c
+** reads this table as midClass[c & 0x7f] whenever the previous character is
+** non-zero. The comment in front of each entry is the character it covers. */
+const unsigned char midClass[] = {
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_SPACE,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_SPACE,  /*   */ CCLASS_SPACE,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_SPACE,
+    /* ! */ CCLASS_OTHER,  /* " */ CCLASS_OTHER,  /* # */ CCLASS_OTHER,
+    /* $ */ CCLASS_OTHER,  /* % */ CCLASS_OTHER,  /* & */ CCLASS_OTHER,
+    /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER,  /* ) */ CCLASS_OTHER,
+    /* * */ CCLASS_OTHER,  /* + */ CCLASS_OTHER,  /* , */ CCLASS_OTHER,
+    /* - */ CCLASS_OTHER,  /* . */ CCLASS_OTHER,  /* / */ CCLASS_OTHER,
+    /* 0 */ CCLASS_DIGIT,  /* 1 */ CCLASS_DIGIT,  /* 2 */ CCLASS_DIGIT,
+    /* 3 */ CCLASS_DIGIT,  /* 4 */ CCLASS_DIGIT,  /* 5 */ CCLASS_DIGIT,
+    /* 6 */ CCLASS_DIGIT,  /* 7 */ CCLASS_DIGIT,  /* 8 */ CCLASS_DIGIT,
+    /* 9 */ CCLASS_DIGIT,  /* : */ CCLASS_OTHER,  /* ; */ CCLASS_OTHER,
+    /* < */ CCLASS_OTHER,  /* = */ CCLASS_OTHER,  /* > */ CCLASS_OTHER,
+    /* ? */ CCLASS_OTHER,  /* @ */ CCLASS_OTHER,  /* A */ CCLASS_VOWEL,
+    /* B */ CCLASS_B,      /* C */ CCLASS_C,      /* D */ CCLASS_D,
+    /* E */ CCLASS_VOWEL,  /* F */ CCLASS_B,      /* G */ CCLASS_C,
+    /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL,  /* J */ CCLASS_C,
+    /* K */ CCLASS_C,      /* L */ CCLASS_L,      /* M */ CCLASS_M,
+    /* N */ CCLASS_M,      /* O */ CCLASS_VOWEL,  /* P */ CCLASS_B,
+    /* Q */ CCLASS_C,      /* R */ CCLASS_R,      /* S */ CCLASS_C,
+    /* T */ CCLASS_D,      /* U */ CCLASS_VOWEL,  /* V */ CCLASS_B,
+    /* W */ CCLASS_B,      /* X */ CCLASS_C,      /* Y */ CCLASS_VOWEL,
+    /* Z */ CCLASS_C,      /* [ */ CCLASS_OTHER,  /* \ */ CCLASS_OTHER,
+    /* ] */ CCLASS_OTHER,  /* ^ */ CCLASS_OTHER,  /* _ */ CCLASS_OTHER,
+    /* ` */ CCLASS_OTHER,  /* a */ CCLASS_VOWEL,  /* b */ CCLASS_B,
+    /* c */ CCLASS_C,      /* d */ CCLASS_D,      /* e */ CCLASS_VOWEL,
+    /* f */ CCLASS_B,      /* g */ CCLASS_C,      /* h */ CCLASS_SILENT,
+    /* i */ CCLASS_VOWEL,  /* j */ CCLASS_C,      /* k */ CCLASS_C,
+    /* l */ CCLASS_L,      /* m */ CCLASS_M,      /* n */ CCLASS_M,
+    /* o */ CCLASS_VOWEL,  /* p */ CCLASS_B,      /* q */ CCLASS_C,
+    /* r */ CCLASS_R,      /* s */ CCLASS_C,      /* t */ CCLASS_D,
+    /* u */ CCLASS_VOWEL,  /* v */ CCLASS_B,      /* w */ CCLASS_B,
+    /* x */ CCLASS_C,      /* y */ CCLASS_VOWEL,  /* z */ CCLASS_C,
+    /* { */ CCLASS_OTHER,  /* | */ CCLASS_OTHER,  /* } */ CCLASS_OTHER,
+    /* ~ */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+};
+/*
+** This table gives the character class for ASCII characters that form the
+** initial character of a word.  The only difference from midClass is with
+** the letters H, W, and Y.
+*/
+/* 128 entries, one per 7-bit ASCII code. characterClass() in fuzzy/editdist.c
+** reads this table as initClass[c & 0x7f] when the previous character is 0.
+**
+** NOTE(review): as written, the entries that differ from midClass[] are
+** the apostrophe (OTHER here, SILENT there) and Y/y (CCLASS_Y here, VOWEL
+** there). H/h and W/w carry the same class in both tables and CCLASS_H is
+** not used by either one -- confirm against upstream spellfix.c before
+** relying on the "H, W, and Y" wording of the comment above. */
+const unsigned char initClass[] = {
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_SPACE,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_SPACE,  /*   */ CCLASS_SPACE,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+    /*   */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,  /*   */ CCLASS_SPACE,
+    /* ! */ CCLASS_OTHER,  /* " */ CCLASS_OTHER,  /* # */ CCLASS_OTHER,
+    /* $ */ CCLASS_OTHER,  /* % */ CCLASS_OTHER,  /* & */ CCLASS_OTHER,
+    /* ' */ CCLASS_OTHER,  /* ( */ CCLASS_OTHER,  /* ) */ CCLASS_OTHER,
+    /* * */ CCLASS_OTHER,  /* + */ CCLASS_OTHER,  /* , */ CCLASS_OTHER,
+    /* - */ CCLASS_OTHER,  /* . */ CCLASS_OTHER,  /* / */ CCLASS_OTHER,
+    /* 0 */ CCLASS_DIGIT,  /* 1 */ CCLASS_DIGIT,  /* 2 */ CCLASS_DIGIT,
+    /* 3 */ CCLASS_DIGIT,  /* 4 */ CCLASS_DIGIT,  /* 5 */ CCLASS_DIGIT,
+    /* 6 */ CCLASS_DIGIT,  /* 7 */ CCLASS_DIGIT,  /* 8 */ CCLASS_DIGIT,
+    /* 9 */ CCLASS_DIGIT,  /* : */ CCLASS_OTHER,  /* ; */ CCLASS_OTHER,
+    /* < */ CCLASS_OTHER,  /* = */ CCLASS_OTHER,  /* > */ CCLASS_OTHER,
+    /* ? */ CCLASS_OTHER,  /* @ */ CCLASS_OTHER,  /* A */ CCLASS_VOWEL,
+    /* B */ CCLASS_B,      /* C */ CCLASS_C,      /* D */ CCLASS_D,
+    /* E */ CCLASS_VOWEL,  /* F */ CCLASS_B,      /* G */ CCLASS_C,
+    /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL,  /* J */ CCLASS_C,
+    /* K */ CCLASS_C,      /* L */ CCLASS_L,      /* M */ CCLASS_M,
+    /* N */ CCLASS_M,      /* O */ CCLASS_VOWEL,  /* P */ CCLASS_B,
+    /* Q */ CCLASS_C,      /* R */ CCLASS_R,      /* S */ CCLASS_C,
+    /* T */ CCLASS_D,      /* U */ CCLASS_VOWEL,  /* V */ CCLASS_B,
+    /* W */ CCLASS_B,      /* X */ CCLASS_C,      /* Y */ CCLASS_Y,
+    /* Z */ CCLASS_C,      /* [ */ CCLASS_OTHER,  /* \ */ CCLASS_OTHER,
+    /* ] */ CCLASS_OTHER,  /* ^ */ CCLASS_OTHER,  /* _ */ CCLASS_OTHER,
+    /* ` */ CCLASS_OTHER,  /* a */ CCLASS_VOWEL,  /* b */ CCLASS_B,
+    /* c */ CCLASS_C,      /* d */ CCLASS_D,      /* e */ CCLASS_VOWEL,
+    /* f */ CCLASS_B,      /* g */ CCLASS_C,      /* h */ CCLASS_SILENT,
+    /* i */ CCLASS_VOWEL,  /* j */ CCLASS_C,      /* k */ CCLASS_C,
+    /* l */ CCLASS_L,      /* m */ CCLASS_M,      /* n */ CCLASS_M,
+    /* o */ CCLASS_VOWEL,  /* p */ CCLASS_B,      /* q */ CCLASS_C,
+    /* r */ CCLASS_R,      /* s */ CCLASS_C,      /* t */ CCLASS_D,
+    /* u */ CCLASS_VOWEL,  /* v */ CCLASS_B,      /* w */ CCLASS_B,
+    /* x */ CCLASS_C,      /* y */ CCLASS_Y,      /* z */ CCLASS_C,
+    /* { */ CCLASS_OTHER,  /* | */ CCLASS_OTHER,  /* } */ CCLASS_OTHER,
+    /* ~ */ CCLASS_OTHER,  /*   */ CCLASS_OTHER,
+};
+
+/*
+** Mapping from the character class number (0-12) to a symbol for each
+** character class.  Note that initClass[] can be used to map the class
+** symbol back into the class number.
+*/
+const unsigned char className[] = ".ABCDHLRMY9 ?";
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/common.h b/libsql-ffi/bundled/sqlean/fuzzy/common.h
new file mode 100644
index 0000000000..a548ec94e2
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/common.h
@@ -0,0 +1,61 @@
+// Adapted from the spellfix SQLite extension, Public Domain
+// https://www.sqlite.org/src/file/ext/misc/spellfix.c
+
+#ifndef COMMON_H
+#define COMMON_H
+
+/*
+** Character classes for ASCII characters:
+**
+**   0   ''        Silent letters:   H W
+**   1   'A'       Any vowel:   A E I O U (Y)
+**   2   'B'       A bilabial stop or fricative:  B F P V W
+**   3   'C'       Other fricatives or back stops:  C G J K Q S X Z
+**   4   'D'       Alveolar stops:  D T
+**   5   'H'       Letter H at the beginning of a word
+**   6   'L'       Glide:  L
+**   7   'R'       Semivowel:  R
+**   8   'M'       Nasals:  M N
+**   9   'Y'       Letter Y at the beginning of a word.
+**   10  '9'       Digits: 0 1 2 3 4 5 6 7 8 9
+**   11  ' '       White space
+**   12  '?'       Other.
+*/
+#define CCLASS_SILENT 0
+#define CCLASS_VOWEL 1
+#define CCLASS_B 2
+#define CCLASS_C 3
+#define CCLASS_D 4
+#define CCLASS_H 5
+#define CCLASS_L 6
+#define CCLASS_R 7
+#define CCLASS_M 8
+#define CCLASS_Y 9
+#define CCLASS_DIGIT 10
+#define CCLASS_SPACE 11
+#define CCLASS_OTHER 12
+
+#define SCRIPT_LATIN 0x0001
+#define SCRIPT_CYRILLIC 0x0002
+#define SCRIPT_GREEK 0x0004
+#define SCRIPT_HEBREW 0x0008
+#define SCRIPT_ARABIC 0x0010
+
+#define ALWAYS(X) 1
+#define NEVER(X) 0
+
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#define EQ(a, b) ((a) == (b))
+#define NOT_EQ(a, b) (!EQ(a, b))
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MIN3(a, b, c) MIN(MIN(a, b), c)
+#define MIN4(a, b, c, d) MIN(MIN(a, b), MIN(c, d))
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MAX3(a, b, c) MAX(MAX(a, b), c)
+#define MAX4(a, b, c, d) MAX(MAX(a, b), MAX(c, d))
+
+#endif
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/damlev.c b/libsql-ffi/bundled/sqlean/fuzzy/damlev.c
new file mode 100644
index 0000000000..093c7c1220
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/damlev.c
@@ -0,0 +1,104 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Calculates and returns the Damerau-Levenshtein distance of two non NULL
+/// strings. More information about the algorithm can be found here:
+/// https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
+///
+/// @param str1 first non NULL string
+/// @param str2 second non NULL string
+///
+/// @returns Damerau-Levenshtein distance of str1 and str2, or (unsigned)-1
+///     if the working memory cannot be allocated
+unsigned damerau_levenshtein(const char* str1, const char* str2) {
+    // strings cannot be NULL
+    assert(str1 != NULL);
+    assert(str2 != NULL);
+
+    // size of the alphabet: one slot for every possible byte value
+    const unsigned alpha_size = 256;
+
+    size_t str1_len = strlen(str1);
+    size_t str2_len = strlen(str2);
+
+    // handle cases where one or both strings are empty
+    if (str1_len == 0) {
+        return str2_len;
+    }
+    if (str2_len == 0) {
+        return str1_len;
+    }
+
+    // remove common prefix
+    while (str1_len > 0 && str2_len > 0 && EQ(str1[0], str2[0])) {
+        str1++, str2++;
+        str1_len--, str2_len--;
+    }
+
+    // upper bound for any distance; deliberately not called INFINITY so it
+    // cannot collide with the <math.h> macro of that name
+    const unsigned inf = str1_len + str2_len;
+
+    size_t m_rows = str1_len + 2;  // matrix rows
+    size_t m_cols = str2_len + 2;  // matrix cols
+
+    // "dictionary": for every byte value, the last row of str1 it was seen in
+    unsigned* dict = calloc(alpha_size, sizeof(unsigned));
+
+    // matrix to hold computed values; the row table is zero-initialised so
+    // that the cleanup below stays valid after a partial allocation failure
+    unsigned** matrix = calloc(m_rows, sizeof(unsigned*));
+    int ok = (dict != NULL && matrix != NULL);
+    for (size_t r = 0; ok && r < m_rows; r++) {
+        matrix[r] = calloc(m_cols, sizeof(unsigned));
+        ok = (matrix[r] != NULL);
+    }
+
+    unsigned result = (unsigned)-1;
+    if (ok) {
+        unsigned row, col;
+
+        // set all the starting values
+        matrix[0][0] = inf;
+        for (row = 1; row < m_rows; row++) {
+            matrix[row][0] = inf;
+            matrix[row][1] = row - 1;
+        }
+        for (col = 1; col < m_cols; col++) {
+            matrix[0][col] = inf;
+            matrix[1][col] = col - 1;
+        }
+
+        // fill in the matrix
+        for (row = 1; row <= str1_len; row++) {
+            unsigned db = 0;
+
+            for (col = 1; col <= str2_len; col++) {
+                // index through unsigned char: a plain char may be negative
+                // and would otherwise address memory outside of dict
+                unsigned i = dict[(unsigned char)str2[col - 1]];
+                unsigned k = db;
+                unsigned cost = EQ(str1[row - 1], str2[col - 1]) ? 0 : 1;
+
+                if (cost == 0) {
+                    db = col;
+                }
+
+                matrix[row + 1][col + 1] =
+                    MIN4(matrix[row][col] + cost, matrix[row + 1][col] + 1, matrix[row][col + 1] + 1,
+                         matrix[i][k] + (row - i - 1) + (col - k - 1) + 1);
+            }
+
+            dict[(unsigned char)str1[row - 1]] = row;
+        }
+
+        result = matrix[m_rows - 1][m_cols - 1];
+    }
+
+    // free allocated memory
+    free(dict);
+    if (matrix != NULL) {
+        for (size_t r = 0; r < m_rows; r++) {
+            free(matrix[r]);
+        }
+    }
+    free(matrix);
+
+    return result;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/editdist.c b/libsql-ffi/bundled/sqlean/fuzzy/editdist.c
new file mode 100644
index 0000000000..934812b93e
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/editdist.c
@@ -0,0 +1,273 @@
+// Originally from the spellfix SQLite extension, Public Domain
+// https://www.sqlite.org/src/file/ext/misc/spellfix.c
+// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "fuzzy/common.h"
+
+extern const unsigned char midClass[];
+extern const unsigned char initClass[];
+extern const unsigned char className[];
+
+/*
+** Return the character class number for a character given its
+** context.  cPrev==0 selects the word-initial table (initClass),
+** anything else the mid-word table (midClass); c is masked to 7 bits
+** before it is used as an index.
+*/
+static char characterClass(char cPrev, char c) {
+    return cPrev == 0 ? initClass[c & 0x7f] : midClass[c & 0x7f];
+}
+
+/*
+** Return the cost of inserting or deleting character c immediately
+** following character cPrev.  If cPrev==0, that means c is the first
+** character of the word.
+*/
+static int insertOrDeleteCost(char cPrev, char c, char cNext) {
+    const char cls = characterClass(cPrev, c);
+
+    /* "Silent" characters such as H or W are almost free to add or drop */
+    if (cls == CCLASS_SILENT)
+        return 1;
+
+    /* A doubled character, or a missed repeat */
+    if (c == cPrev)
+        return 10;
+
+    /* A vowel sitting directly before or after 'r' */
+    if (cls == CCLASS_VOWEL && (cPrev == 'r' || cNext == 'r'))
+        return 20;
+
+    /* c belongs to the same class as the character in front of it:
+    ** growing or shrinking a vowel cluster costs 15, any other class 50 */
+    if (cls == characterClass(cPrev, cPrev))
+        return (cls == CCLASS_VOWEL) ? 15 : 50;
+
+    /* every remaining insertion or deletion */
+    return 100;
+}
+
+/*
+** Insertion costs are divided by this factor when the inserted
+** characters are appended to the end of the word.
+*/
+#define FINAL_INS_COST_DIV 4
+
+/*
+** Cost of replacing cFrom with cTo when the preceding character is
+** cPrev.  cPrev==0 means that cTo is the first character of the word.
+*/
+static int substituteCost(char cPrev, char cFrom, char cTo) {
+    /* identical characters */
+    if (cTo == cFrom)
+        return 0;
+
+    /* the same letter in the other case (ASCII case bit 0x20 flipped) */
+    const int toIsLetter = (cTo >= 'A' && cTo <= 'Z') || (cTo >= 'a' && cTo <= 'z');
+    if (cFrom == (cTo ^ 0x20) && toIsLetter)
+        return 0;
+
+    const char fromCls = characterClass(cPrev, cFrom);
+    const char toCls = characterClass(cPrev, cTo);
+
+    /* both characters fall into one class */
+    if (toCls == fromCls)
+        return 40;
+
+    /* two consonants (classes CCLASS_B..CCLASS_Y) of different classes */
+    const int fromIsConsonant = fromCls >= CCLASS_B && fromCls <= CCLASS_Y;
+    const int toIsConsonant = toCls >= CCLASS_B && toCls <= CCLASS_Y;
+    if (fromIsConsonant && toIsConsonant)
+        return 75;
+
+    /* any other substitution */
+    return 100;
+}
+
+/*
+** Given two strings zA and zB which are pure ASCII, return the cost
+** of transforming zA into zB.
If zA ends with '*' assume that it is
+** a prefix of zB and give only minimal penalty for extra characters
+** on the end of zB.
+**
+** Smaller numbers mean a closer match.
+**
+** Negative values indicate an error:
+**    -1  One of the inputs is NULL
+**    -2  Non-ASCII characters on input
+**    -3  Unable to allocate memory
+**
+** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
+** of zB that matched the pattern in zA. If zA does not end with a '*',
+** then this value is always the number of bytes in zB (i.e. strlen(zB)).
+** If zA does end in a '*', then it is the number of bytes in the prefix
+** of zB that was deemed to match zA.
+**
+** NOTE(review): the code below only ever stores the common-prefix length
+** and, for a trailing '*', the best prefix length into *pnMatch; for a
+** pattern without '*' that reaches the matrix phase it asserts that
+** pnMatch is NULL. Callers must therefore pass pnMatch==0 unless zA ends
+** with '*'.
+*/
+int edit_distance(const char* zA, const char* zB, int* pnMatch) {
+    int nA, nB;          /* Number of characters in zA[] and zB[] */
+    int xA, xB;          /* Loop counters for zA[] and zB[] */
+    char cA = 0, cB;     /* Current character of zA and zB */
+    char cAprev, cBprev; /* Previous character of zA and zB */
+    char cAnext, cBnext; /* Next character in zA and zB */
+    int d;               /* North-west cost value */
+    int dc = 0;          /* North-west character value */
+    int res;             /* Final result */
+    int* m;              /* The cost matrix */
+    char* cx;            /* Corresponding character values */
+    int* toFree = 0;     /* Malloced space */
+    int nMatch = 0;      /* Length of the common prefix that was skipped */
+    int mStack[60 + 15]; /* Stack space to use if not too much is needed */
+
+    /* Early out if either input is NULL */
+    if (zA == 0 || zB == 0)
+        return -1;
+
+    /* Skip any common prefix */
+    while (zA[0] && zA[0] == zB[0]) {
+        dc = zA[0];
+        zA++;
+        zB++;
+        nMatch++;
+    }
+    if (pnMatch)
+        *pnMatch = nMatch;
+    if (zA[0] == 0 && zB[0] == 0)
+        return 0;
+
+#if 0
+  printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
+#endif
+
+    /* Verify input strings and measure their lengths */
+    for (nA = 0; zA[nA]; nA++) {
+        if (zA[nA] & 0x80)
+            return -2;
+    }
+    for (nB = 0; zB[nB]; nB++) {
+        if (zB[nB] & 0x80)
+            return -2;
+    }
+
+    /* Special processing if either string is empty */
+    if (nA == 0) {
+        /* Only insertions at the end of the word remain: discounted cost */
+        cBprev = (char)dc;
+        for (xB = res = 0; (cB = zB[xB]) != 0; xB++) {
+            res += insertOrDeleteCost(cBprev, cB, zB[xB + 1]) / FINAL_INS_COST_DIV;
+            cBprev = cB;
+        }
+        return res;
+    }
+    if (nB == 0) {
+        /* Only deletions remain: full cost */
+        cAprev = (char)dc;
+        for (xA = res = 0; (cA = zA[xA]) != 0; xA++) {
+            res += insertOrDeleteCost(cAprev, cA, zA[xA + 1]);
+            cAprev = cA;
+        }
+        return res;
+    }
+
+    /* A is a prefix of B */
+    if (zA[0] == '*' && zA[1] == 0)
+        return 0;
+
+    /* Allocate and initialize the Wagner matrix.  One buffer holds nB+1
+    ** int costs followed by nB+1 char values, hence the 5/4 sizing. */
+    if ((size_t)nB < (sizeof(mStack) * 4) / (sizeof(mStack[0]) * 5)) {
+        m = mStack;
+    } else {
+        m = toFree = malloc((nB + 1) * 5 * sizeof(m[0]) / 4);
+        if (m == 0)
+            return -3;
+    }
+    cx = (char*)&m[nB + 1];
+
+    /* Compute the Wagner edit distance */
+    m[0] = 0;
+    cx[0] = (char)dc;
+    cBprev = (char)dc;
+    for (xB = 1; xB <= nB; xB++) {
+        cBnext = zB[xB];
+        cB = zB[xB - 1];
+        cx[xB] = cB;
+        m[xB] = m[xB - 1] + insertOrDeleteCost(cBprev, cB, cBnext);
+        cBprev = cB;
+    }
+    cAprev = (char)dc;
+    for (xA = 1; xA <= nA; xA++) {
+        int lastA = (xA == nA);
+        cA = zA[xA - 1];
+        cAnext = zA[xA];
+        /* A trailing '*' is the prefix marker, not a character to match */
+        if (cA == '*' && lastA)
+            break;
+        d = m[0];
+        dc = cx[0];
+        m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
+        cBprev = 0;
+        for (xB = 1; xB <= nB; xB++) {
+            int totalCost, insCost, delCost, subCost, ncx;
+            cB = zB[xB - 1];
+            cBnext = zB[xB];
+
+            /* Cost to insert cB */
+            insCost = insertOrDeleteCost(cx[xB - 1], cB, cBnext);
+            if (lastA)
+                insCost /= FINAL_INS_COST_DIV;
+
+            /* Cost to delete cA */
+            delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
+
+            /* Cost to substitute cA->cB */
+            subCost = substituteCost(cx[xB - 1], cA, cB);
+
+            /* Best cost */
+            totalCost = insCost + m[xB - 1];
+            ncx = cB;
+            if ((delCost + m[xB]) < totalCost) {
+                totalCost = delCost + m[xB];
+                ncx = cA;
+            }
+            if ((subCost + d) < totalCost) {
+                totalCost = subCost + d;
+            }
+
+#if 0
+      printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
+             " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
+             xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
+             insCost, delCost, subCost, totalCost, ncx?ncx:' ');
+#endif
+
+            /* Update the matrix */
+            d = m[xB];
+            dc = cx[xB];
+            m[xB] = totalCost;
+            cx[xB] = (char)ncx;
+            cBprev = cB;
+        }
+        cAprev = cA;
+    }
+
+    /* Free the wagner matrix and return the result */
+    if (cA == '*') {
+        /* Prefix match: take the cheapest prefix of zB.  *pnMatch is only
+        ** rewritten when a strictly cheaper cell than m[1] is found. */
+        res = m[1];
+        for (xB = 1; xB <= nB; xB++) {
+            if (m[xB] < res) {
+                res = m[xB];
+                if (pnMatch)
+                    *pnMatch = xB + nMatch;
+            }
+        }
+    } else {
+        res = m[nB];
+        /* In the current implementation, pnMatch is always NULL if zA does
+        ** not end in "*" */
+        assert(pnMatch == 0);
+    }
+    free(toFree);
+    return res;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/extension.c b/libsql-ffi/bundled/sqlean/fuzzy/extension.c
new file mode 100644
index 0000000000..745c23585d
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/extension.c
@@ -0,0 +1,289 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Fuzzy string matching and phonetics.
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT3
+
+#include "fuzzy/fuzzy.h"
+
+// is_ascii checks if the string consists of ASCII symbols only
+// (no byte of the NUL-terminated string has its high bit set)
+static bool is_ascii(const unsigned char* str) {
+    for (int idx = 0; str[idx]; idx++) {
+        if (str[idx] & 0x80) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Below are functions extracted from the
+// https://github.com/Rostepher/libstrcmp/
+
+// fuzzy_damlev implements Damerau-Levenshtein distance.
+// Raises an SQL error when an argument is NULL or not pure ASCII.
+static void fuzzy_damlev(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    const unsigned char* str1 = sqlite3_value_text(argv[0]);
+    const unsigned char* str2 = sqlite3_value_text(argv[1]);
+    if (str1 == 0 || str2 == 0) {
+        sqlite3_result_error(context, "arguments should not be NULL", -1);
+        return;
+    }
+    if (!is_ascii(str1) || !is_ascii(str2)) {
+        sqlite3_result_error(context, "arguments should be ASCII strings", -1);
+        return;
+    }
+    unsigned distance = damerau_levenshtein((const char*)str1, (const char*)str2);
+    sqlite3_result_int(context, distance);
+}
+
+// 
fuzzy_hamming implements Hamming distance
+// SQL arguments: two non-NULL ASCII strings. The value computed by
+// hamming() is returned unchanged, including its -1 result.
+static void fuzzy_hamming(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    const unsigned char* lhs = sqlite3_value_text(argv[0]);
+    const unsigned char* rhs = sqlite3_value_text(argv[1]);
+    if (!lhs || !rhs) {
+        sqlite3_result_error(context, "arguments should not be NULL", -1);
+        return;
+    }
+    if (!(is_ascii(lhs) && is_ascii(rhs))) {
+        sqlite3_result_error(context, "arguments should be ASCII strings", -1);
+        return;
+    }
+    sqlite3_result_int(context, hamming((const char*)lhs, (const char*)rhs));
+}
+
+// fuzzy_jarowin implements Jaro-Winkler distance
+// SQL arguments: two non-NULL ASCII strings; the result is a double.
+static void fuzzy_jarowin(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    const unsigned char* lhs = sqlite3_value_text(argv[0]);
+    const unsigned char* rhs = sqlite3_value_text(argv[1]);
+    if (!lhs || !rhs) {
+        sqlite3_result_error(context, "arguments should not be NULL", -1);
+        return;
+    }
+    if (!(is_ascii(lhs) && is_ascii(rhs))) {
+        sqlite3_result_error(context, "arguments should be ASCII strings", -1);
+        return;
+    }
+    sqlite3_result_double(context, jaro_winkler((const char*)lhs, (const char*)rhs));
+}
+
+// fuzzy_leven implements Levenshtein distance
+// SQL arguments: two non-NULL ASCII strings; the result is an integer.
+static void fuzzy_leven(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    const unsigned char* lhs = sqlite3_value_text(argv[0]);
+    const unsigned char* rhs = sqlite3_value_text(argv[1]);
+    if (!lhs || !rhs) {
+        sqlite3_result_error(context, "arguments should not be NULL", -1);
+        return;
+    }
+    if (!(is_ascii(lhs) && is_ascii(rhs))) {
+        sqlite3_result_error(context, "arguments should be ASCII strings", -1);
+        return;
+    }
+    sqlite3_result_int(context, levenshtein((const char*)lhs, (const char*)rhs));
+}
+
+// fuzzy_osadist implements Optimal String Alignment distance
+static void 
fuzzy_osadist(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    const unsigned char* lhs = sqlite3_value_text(argv[0]);
+    const unsigned char* rhs = sqlite3_value_text(argv[1]);
+    if (!lhs || !rhs) {
+        sqlite3_result_error(context, "arguments should not be NULL", -1);
+        return;
+    }
+    if (!(is_ascii(lhs) && is_ascii(rhs))) {
+        sqlite3_result_error(context, "arguments should be ASCII strings", -1);
+        return;
+    }
+    sqlite3_result_int(context, optimal_string_alignment((const char*)lhs, (const char*)rhs));
+}
+
+// fuzzy_soundex implements Soundex coding
+// A NULL argument returns without setting a result; a non-ASCII argument
+// raises an SQL error. The code string is released by SQLite through free().
+static void fuzzy_soundex(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    const unsigned char* text = sqlite3_value_text(argv[0]);
+    if (!text) {
+        return;
+    }
+    if (is_ascii(text)) {
+        sqlite3_result_text(context, soundex((const char*)text), -1, free);
+        return;
+    }
+    sqlite3_result_error(context, "argument should be ASCII string", -1);
+}
+
+// fuzzy_rsoundex implements Refined Soundex coding
+// Same argument handling as fuzzy_soundex.
+static void fuzzy_rsoundex(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    const unsigned char* text = sqlite3_value_text(argv[0]);
+    if (!text) {
+        return;
+    }
+    if (is_ascii(text)) {
+        sqlite3_result_text(context, refined_soundex((const char*)text), -1, free);
+        return;
+    }
+    sqlite3_result_error(context, "argument should be ASCII string", -1);
+}
+
+// Below are functions extracted from the spellfix SQLite extension
+// https://www.sqlite.org/src/file/ext/misc/spellfix.c
+
+/*
+** fuzzy_phonetic(X)
+**
+** Generate a "phonetic hash" from a string of ASCII characters in X.
+**
+**   * Map characters by character class as defined above.
+**   * Omit double-letters
+**   * Omit vowels beside R and L
+**   * Omit T when followed by CH
+**   * Omit W when followed by R
+**   * Omit D when followed by J or G
+**   * Omit K in KN or G in GN at the beginning of a word
+**
+** The buffer returned by phonetic_hash() is handed to SQLite together
+** with free() as its destructor.
+**
+** A NULL argument returns without setting a result; a NULL hash is
+** reported as an out-of-memory error.
+*/
+static void fuzzy_phonetic(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const unsigned char* text = sqlite3_value_text(argv[0]);
+    if (text == 0)
+        return;
+
+    unsigned char* hash = phonetic_hash(text, sqlite3_value_bytes(argv[0]));
+    if (hash != 0) {
+        sqlite3_result_text(context, (char*)hash, -1, free);
+        return;
+    }
+    sqlite3_result_error_nomem(context);
+}
+
+/*
+** fuzzy_editdist(A,B)
+**
+** Return the cost of transforming string A into string B.  Both strings
+** must be pure ASCII text.  If A ends with '*' then it is assumed to be
+** a prefix of B and extra characters on the end of B have minimal additional
+** cost.
+**
+** Negative codes from edit_distance() become SQL errors: -3 is out of
+** memory, -2 is non-ASCII input, anything else is NULL input.
+*/
+static void fuzzy_editdist(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const char* from = (const char*)sqlite3_value_text(argv[0]);
+    const char* to = (const char*)sqlite3_value_text(argv[1]);
+    int cost = edit_distance(from, to, 0);
+
+    if (cost >= 0) {
+        sqlite3_result_int(context, cost);
+        return;
+    }
+    switch (cost) {
+        case -3:
+            sqlite3_result_error_nomem(context);
+            break;
+        case -2:
+            sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
+            break;
+        default:
+            sqlite3_result_error(context, "NULL input to editdist()", -1);
+            break;
+    }
+}
+
+/*
+** fuzzy_translit(X)
+**
+** Convert a string that contains non-ASCII Roman characters into
+** pure ASCII.
+*/
+static void fuzzy_translit(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const unsigned char* zIn = sqlite3_value_text(argv[0]);
+    int nIn = sqlite3_value_bytes(argv[0]);
+    unsigned char* zOut = transliterate(zIn, nIn);
+    if (zOut == 0) {
+        sqlite3_result_error_nomem(context);
+    } else {
+        /* SQLite releases the buffer through free() */
+        sqlite3_result_text(context, (char*)zOut, -1, free);
+    }
+}
+
+/*
+** fuzzy_script(X)
+**
+** Try to determine the dominant script used by the word X and return
+** its ISO 15924 numeric code.
+**
+** The current implementation only understands the following scripts:
+**
+**    215  (Latin)
+**    220  (Cyrillic)
+**    200  (Greek)
+**
+** This routine will return 998 if the input X contains characters from
+** two or more of the above scripts or 999 if X contains no characters
+** from any of the above scripts.
+*/
+static void fuzzy_script(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const unsigned char* zIn = sqlite3_value_text(argv[0]);
+    int nIn = sqlite3_value_bytes(argv[0]);
+    int res = script_code(zIn, nIn);
+    sqlite3_result_int(context, res);
+}
+
+// Below are custom functions
+
+// fuzzy_caver implements Caverphone coding.
+// A NULL argument returns without setting a result; a non-ASCII argument
+// raises an SQL error.
+static void fuzzy_caver(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    const unsigned char* source = sqlite3_value_text(argv[0]);
+    if (source == 0) {
+        return;
+    }
+    if (!is_ascii(source)) {
+        sqlite3_result_error(context, "argument should be ASCII string", -1);
+        return;
+    }
+    char* result = caverphone((const char*)source);
+    sqlite3_result_text(context, result, -1, free);
+}
+
+// fuzzy_init registers every fuzzy_* SQL function and its short alias on db.
+//
+// All registrations are attempted even if one of them fails. Returns
+// SQLITE_OK when every sqlite3_create_function() call succeeded, otherwise
+// the first error code that was reported.
+int fuzzy_init(sqlite3* db) {
+    static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC;
+    static const struct {
+        const char* name;
+        int nargs;
+        void (*fn)(sqlite3_context*, int, sqlite3_value**);
+    } funcs[] = {
+        // libstrcmp
+        {"fuzzy_damlev", 2, fuzzy_damlev},
+        {"dlevenshtein", 2, fuzzy_damlev},
+        {"fuzzy_hamming", 2, fuzzy_hamming},
+        {"hamming", 2, fuzzy_hamming},
+        {"fuzzy_jarowin", 2, fuzzy_jarowin},
+        {"jaro_winkler", 2, fuzzy_jarowin},
+        {"fuzzy_leven", 2, fuzzy_leven},
+        {"levenshtein", 2, fuzzy_leven},
+        {"fuzzy_osadist", 2, fuzzy_osadist},
+        {"osa_distance", 2, fuzzy_osadist},
+        {"fuzzy_soundex", 1, fuzzy_soundex},
+        {"soundex", 1, fuzzy_soundex},
+        {"fuzzy_rsoundex", 1, fuzzy_rsoundex},
+        {"rsoundex", 1, fuzzy_rsoundex},
+        // spellfix
+        {"fuzzy_editdist", 2, fuzzy_editdist},
+        {"edit_distance", 2, fuzzy_editdist},
+        {"fuzzy_phonetic", 1, fuzzy_phonetic},
+        {"phonetic_hash", 1, fuzzy_phonetic},
+        {"fuzzy_script", 1, fuzzy_script},
+        {"script_code", 1, fuzzy_script},
+        {"fuzzy_translit", 1, fuzzy_translit},
+        {"translit", 1, fuzzy_translit},
+        // custom
+        {"fuzzy_caver", 1, fuzzy_caver},
+        {"caverphone", 1, fuzzy_caver},
+    };
+
+    int first_error = SQLITE_OK;
+    for (int i = 0; i < (int)(sizeof(funcs) / sizeof(funcs[0])); i++) {
+        int rc = sqlite3_create_function(db, funcs[i].name, funcs[i].nargs, flags, 0, funcs[i].fn,
+                                         0, 0);
+        if (rc != SQLITE_OK && first_error == SQLITE_OK) {
+            first_error = rc;
+        }
+    }
+    return first_error;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/extension.h b/libsql-ffi/bundled/sqlean/fuzzy/extension.h
new file mode 100644
index 0000000000..d4e1610f1b
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/extension.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// 
https://github.com/nalgeon/sqlean
+
+// Fuzzy string matching and phonetics.
+
+#ifndef FUZZY_EXTENSION_H
+#define FUZZY_EXTENSION_H
+
+#include "sqlite3ext.h"
+
+// Registers the fuzzy_* SQL functions (and their aliases) on `db`.
+// Returns SQLITE_OK on success.
+int fuzzy_init(sqlite3* db);
+
+#endif /* FUZZY_EXTENSION_H */
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/fuzzy.h b/libsql-ffi/bundled/sqlean/fuzzy/fuzzy.h
new file mode 100644
index 0000000000..5a6a80342d
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/fuzzy.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#ifndef FUZZY_H
+#define FUZZY_H
+
+// distance metrics
+// All of these take NUL-terminated strings and compare them byte by byte.
+unsigned damerau_levenshtein(const char*, const char*);
+// Number of differing positions, or -1 when the lengths differ.
+int hamming(const char*, const char*);
+// Jaro similarity score.
+double jaro(const char*, const char*);
+// Jaro score boosted for a shared prefix (scaling factor 0.1).
+double jaro_winkler(const char*, const char*);
+// Edit distance counting insertions, deletions and substitutions.
+unsigned levenshtein(const char*, const char*);
+// Optimal string alignment distance (edit distance with transpositions).
+unsigned optimal_string_alignment(const char*, const char*);
+int edit_distance(const char*, const char*, int*);
+
+// phonetics
+// Each function returns a heap buffer that the caller releases with free().
+char* caverphone(const char*);
+// Letter followed by three digits, e.g. "R163".
+char* soundex(const char*);
+// Upper-cased first letter followed by a variable number of digits.
+char* refined_soundex(const char*);
+// Takes the input bytes and their count; returns NULL if allocation fails.
+unsigned char* phonetic_hash(const unsigned char*, int);
+
+// translit
+// Takes the input bytes and their count; the result is released with free().
+unsigned char* transliterate(const unsigned char*, int);
+int translen_to_charlen(const char*, int, int);
+// Takes the input bytes and their count.
+int script_code(const unsigned char*, int);
+
+#endif
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/hamming.c b/libsql-ffi/bundled/sqlean/fuzzy/hamming.c
new file mode 100644
index 0000000000..e0234c3ece
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/hamming.c
@@ -0,0 +1,46 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Computes and returns the hamming distance between two strings. Both strings
+/// must have the same length and not be NULL.
More information about the
+/// algorithm can be found here:
+/// http://en.wikipedia.org/wiki/Hamming_distance
+///
+/// @param str1 first string
+/// @param str2 second string
+///
+/// @returns hamming distance or -1 if str1 and str2 did not have the same
+/// length or if one or both str1 and str2 were NULL
+int hamming(const char* str1, const char* str2) {
+    // honour the documented contract instead of aborting (debug builds) or
+    // dereferencing NULL (release builds)
+    if (str1 == NULL || str2 == NULL) {
+        return -1;
+    }
+
+    size_t len = strlen(str1);
+
+    // the distance is only defined for strings of equal length
+    if (len != strlen(str2)) {
+        return -1;
+    }
+
+    // count the positions at which the two strings differ;
+    // two empty strings fall through with a distance of 0
+    int dist = 0;
+    for (size_t i = 0; i < len; i++) {
+        dist += (NOT_EQ(str1[i], str2[i]));
+    }
+
+    return dist;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/jarowin.c b/libsql-ffi/bundled/sqlean/fuzzy/jarowin.c
new file mode 100644
index 0000000000..b874aba6e0
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/jarowin.c
@@ -0,0 +1,134 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Calculates and returns the Jaro distance of two non NULL strings.
+/// More information about the algorithm can be found here:
+/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
+///
+/// @param str1 first non NULL string
+/// @param str2 second non NULL string
+///
+/// @returns the jaro distance of str1 and str2 in the range [0, 1];
+/// 0.0 is also returned if working memory cannot be allocated
+double jaro(const char* str1, const char* str2) {
+    // strings cannot be NULL
+    assert(str1 != NULL);
+    assert(str2 != NULL);
+
+    int str1_len = (int)strlen(str1);
+    int str2_len = (int)strlen(str2);
+
+    // if both strings are empty return 1
+    // if only one of the strings is empty return 0
+    if (str1_len == 0 || str2_len == 0) {
+        return (str1_len == str2_len) ? 1.0 : 0.0;
+    }
+
+    // max distance between two chars to be considered matching
+    // floor() is omitted due to integer division rules
+    int match_dist = (int)MAX(str1_len, str2_len) / 2 - 1;
+    // for one-character strings the formula yields -1, which would make the
+    // search window empty and score identical strings as 0
+    if (match_dist < 0) {
+        match_dist = 0;
+    }
+
+    // arrays of flags that signify if that char in the matching string has a
+    // match
+    int* str1_matches = calloc((size_t)str1_len, sizeof(int));
+    int* str2_matches = calloc((size_t)str2_len, sizeof(int));
+    if (str1_matches == NULL || str2_matches == NULL) {
+        free(str1_matches);
+        free(str2_matches);
+        return 0.0;
+    }
+
+    // number of matches and transpositions
+    double matches = 0.0;
+    double trans = 0.0;
+
+    // find the matches
+    for (int i = 0; i < str1_len; i++) {
+        // start and end take into account the match distance
+        int start = MAX(0, i - match_dist);
+        int end = MIN(i + match_dist + 1, str2_len);
+
+        for (int k = start; k < end; k++) {
+            // if str2 already has a match or str1 and str2 are not equal
+            // continue
+            if (str2_matches[k] || NOT_EQ(str1[i], str2[k])) {
+                continue;
+            }
+
+            // otherwise assume there is a match
+            str1_matches[i] = true;
+            str2_matches[k] = true;
+            matches++;
+            break;
+        }
+    }
+
+    // if there are no matches return 0
+    if (matches == 0) {
+        free(str1_matches);
+        free(str2_matches);
+        return 0.0;
+    }
+
+    // count transpositions
+    int k = 0;
+    for (int i = 0; i < str1_len; i++) {
+        // if there are no matches in str1 continue
+        if (!str1_matches[i]) {
+            continue;
+        }
+
+        // while there is no match in str2 increment k
+        // (both arrays hold the same number of set flags, so k stays in range)
+        while (!str2_matches[k]) {
+            k++;
+        }
+
+        // increment trans
+        if (NOT_EQ(str1[i], str2[k])) {
+            trans++;
+        }
+
+        k++;
+    }
+
+    // divide the number of transpositions by two as per the algorithm specs
+    // this division is valid because the counted transpositions include both
+    // instances of the transposed characters.
+    trans /= 2.0;
+
+    // free allocated memory
+    free(str1_matches);
+    free(str2_matches);
+
+    // return the jaro distance
+    return ((matches / str1_len) + (matches / str2_len) + ((matches - trans) / matches)) / 3.0;
+}
+
+/// Calculates and returns the Jaro-Winkler distance of two non NULL strings.
+/// More information about the algorithm can be found here:
+/// http://en.wikipedia.org/wiki/Jaro-Winkler_distance
+///
+/// @param str1 first non NULL string
+/// @param str2 second non NULL string
+///
+/// @returns the jaro-winkler distance of str1 and str2
+double jaro_winkler(const char* str1, const char* str2) {
+    // strings cannot be NULL
+    assert(str1 != NULL);
+    assert(str2 != NULL);
+
+    // compute the jaro distance
+    double dist = jaro(str1, str2);
+
+    // length of the common prefix, capped at 3 characters.
+    // The scan stops at the terminator of str1; an equal byte in str2 is then
+    // necessarily not a terminator either, so neither string is read past its
+    // end.
+    int prefix_length = 0;
+    while (prefix_length < 3 && str1[prefix_length] != '\0' &&
+           EQ(str1[prefix_length], str2[prefix_length])) {
+        prefix_length++;
+    }
+
+    // 0.1 is the default scaling factor
+    return dist + prefix_length * 0.1 * (1 - dist);
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/leven.c b/libsql-ffi/bundled/sqlean/fuzzy/leven.c
new file mode 100644
index 0000000000..12c815147f
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/leven.c
@@ -0,0 +1,73 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Calculates and returns the Levenshtein distance of two non NULL strings.
+/// More information about the algorithm can be found here:
+/// https://en.wikipedia.org/wiki/Levenshtein_distance
+///
+/// @param str1 first non NULL string
+/// @param str2 second non NULL string
+///
+/// @returns the levenshtein distance of str1 and str2, or (unsigned)-1 if
+/// working memory cannot be allocated
+unsigned levenshtein(const char* str1, const char* str2) {
+    // strings cannot be NULL
+    assert(str1 != NULL);
+    assert(str2 != NULL);
+
+    size_t str1_len = strlen(str1);
+    size_t str2_len = strlen(str2);
+
+    // remove common prefix, it never contributes to the distance
+    while (str1_len > 0 && str2_len > 0 && EQ(str1[0], str2[0])) {
+        str1++, str2++;
+        str1_len--, str2_len--;
+    }
+
+    // handle cases where one or both (remaining) strings are empty
+    if (str1_len == 0) {
+        return (unsigned)str2_len;
+    }
+    if (str2_len == 0) {
+        return (unsigned)str1_len;
+    }
+
+    // single rolling row of the distance matrix; vector[col] is the distance
+    // between the first `col` chars of str1 and the str2 prefix processed so far
+    unsigned* vector = malloc((str1_len + 1) * sizeof(unsigned));
+    if (vector == NULL) {
+        return (unsigned)-1;
+    }
+    for (size_t col = 0; col <= str1_len; col++) {
+        vector[col] = (unsigned)col;
+    }
+
+    // iterate through the imagined rows of the matrix
+    for (size_t row = 1; row <= str2_len; row++) {
+        // value diagonally up-left of the cell about to be written
+        unsigned last_diag = vector[0];
+        vector[0] = (unsigned)row;
+
+        // iterate through each member of the vector
+        for (size_t col = 1; col <= str1_len; col++) {
+            // remember the diagonal before overwriting the array
+            unsigned cur = vector[col];
+
+            // calculate the cost
+            unsigned cost = EQ(str1[col - 1], str2[row - 1]) ? 0 : 1;
+
+            // determine min of the possible values
+            vector[col] = MIN3(vector[col] + 1, vector[col - 1] + 1, last_diag + cost);
+
+            // remember the new last_diag
+            last_diag = cur;
+        }
+    }
+
+    unsigned result = vector[str1_len];
+    free(vector);
+    return result;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/osadist.c b/libsql-ffi/bundled/sqlean/fuzzy/osadist.c
new file mode 100644
index 0000000000..98fddfdb71
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/osadist.c
@@ -0,0 +1,84 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Computes and returns the Optimal String Alignment distance for two non NULL
+/// strings. More information about the algorithm can be found here:
+/// https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
+///
+/// @param str1 first non NULL string
+/// @param str2 second non NULL string
+///
+/// @returns optimal string alignment distance for str1 and str2, or
+/// (unsigned)-1 if working memory cannot be allocated
+unsigned optimal_string_alignment(const char* str1, const char* str2) {
+    // strings cannot be NULL
+    assert(str1 != NULL);
+    assert(str2 != NULL);
+
+    size_t str1_len = strlen(str1);
+    size_t str2_len = strlen(str2);
+
+    // remove common prefix, it never contributes to the distance
+    while (str1_len > 0 && str2_len > 0 && EQ(str1[0], str2[0])) {
+        str1++, str2++;
+        str1_len--, str2_len--;
+    }
+
+    // handle cases where one or both (remaining) strings are empty
+    if (str1_len == 0) {
+        return (unsigned)str2_len;
+    }
+    if (str2_len == 0) {
+        return (unsigned)str1_len;
+    }
+
+    // (str1_len + 1) x (str2_len + 1) matrix stored row-major in one block;
+    // refuse sizes whose byte count would overflow size_t
+    const size_t width = str2_len + 1;
+    if (str1_len + 1 > ((size_t)-1) / sizeof(unsigned) / width) {
+        return (unsigned)-1;
+    }
+    unsigned* matrix = malloc((str1_len + 1) * width * sizeof(unsigned));
+    if (matrix == NULL) {
+        return (unsigned)-1;
+    }
+
+    // first row: distance from the empty prefix of str1
+    for (size_t col = 0; col <= str2_len; col++) {
+        matrix[col] = (unsigned)col;
+    }
+
+    // iterate through and fill in the matrix
+    for (size_t row = 1; row <= str1_len; row++) {
+        unsigned* cur = matrix + row * width;
+        const unsigned* prev = cur - width;
+
+        // first column: distance from the empty prefix of str2
+        cur[0] = (unsigned)row;
+
+        for (size_t col = 1; col <= str2_len; col++) {
+            unsigned cost = EQ(str1[row - 1], str2[col - 1]) ? 0 : 1;
+
+            unsigned best = MIN3(prev[col] + 1,        // deletion
+                                 cur[col - 1] + 1,     // insertion
+                                 prev[col - 1] + cost  // substitution
+            );
+
+            // transposition: the last two chars of both prefixes are swapped.
+            // Cell (row, col) covers str1[row - 1] and str2[col - 1], so the
+            // pair to compare is (row - 1, col - 2) and (row - 2, col - 1).
+            if (row > 1 && col > 1 && EQ(str1[row - 1], str2[col - 2]) &&
+                EQ(str1[row - 2], str2[col - 1])) {
+                const unsigned* prev2 = prev - width;
+                best = MIN(best, prev2[col - 2] + 1);
+            }
+
+            cur[col] = best;
+        }
+    }
+
+    unsigned result = matrix[str1_len * width + str2_len];
+
+    // free allocated memory
+    free(matrix);
+
+    return result;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/phonetic.c b/libsql-ffi/bundled/sqlean/fuzzy/phonetic.c
new file mode 100644
index 0000000000..31f9a1ae02
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/phonetic.c
@@ -0,0 +1,87 @@
+// Originally from the spellfix SQLite extension, Public Domain
+// https://www.sqlite.org/src/file/ext/misc/spellfix.c
+// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "fuzzy/common.h"
+
+extern const unsigned char midClass[];
+extern const unsigned char initClass[];
+extern const unsigned char className[];
+
+/*
+** Generate a "phonetic hash" from a string of ASCII characters
+** in zIn[0..nIn-1].
+**
+**   * Map characters by character class (initClass[] for the first
+**     character that is kept, midClass[] afterwards).
+**   * Omit double-letters
+**   * Omit vowels beside R and L
+**   * Omit T when followed by CH
+**   * Omit W when followed by R
+**   * Omit D when followed by J or G
+**   * Omit K in KN or G in GN at the beginning of a word
+**
+** Space to hold the result is obtained from malloc() and must be
+** released by the caller with free().
+**
+** Return NULL if memory allocation fails.
+*/
+unsigned char* phonetic_hash(const unsigned char* zIn, int nIn) {
+    /* The output never has more characters than the input, plus a terminator. */
+    unsigned char* zOut = malloc(nIn + 1);
+    int i;
+    int nOut = 0;
+    /* 0x77 is a placeholder that is compared against the CCLASS_* codes below
+    ** before any character has been classified. */
+    char cPrev = 0x77;  /* class of the previous character that was kept */
+    char cPrevX = 0x77; /* same, but ignoring silent characters */
+    const unsigned char* aClass = initClass;
+
+    if (zOut == 0)
+        return 0;
+    /* Drop the leading K of "KN..." or G of "GN...". */
+    if (nIn > 2) {
+        switch (zIn[0]) {
+            case 'g':
+            case 'k': {
+                if (zIn[1] == 'n') {
+                    zIn++;
+                    nIn--;
+                }
+                break;
+            }
+        }
+    }
+    for (i = 0; i < nIn; i++) {
+        unsigned char c = zIn[i];
+        /* Letters dropped because of what follows them. */
+        if (i + 1 < nIn) {
+            if (c == 'w' && zIn[i + 1] == 'r')
+                continue;
+            if (c == 'd' && (zIn[i + 1] == 'j' || zIn[i + 1] == 'g'))
+                continue;
+            if (i + 2 < nIn) {
+                if (c == 't' && zIn[i + 1] == 'c' && zIn[i + 2] == 'h')
+                    continue;
+            }
+        }
+        /* Only the low 7 bits index the class table. */
+        c = aClass[c & 0x7f];
+        if (c == CCLASS_SPACE)
+            continue;
+        if (c == CCLASS_OTHER && cPrev != CCLASS_DIGIT)
+            continue;
+        /* From the first kept character on, use the mid-word classes. */
+        aClass = midClass;
+        if (c == CCLASS_VOWEL && (cPrevX == CCLASS_R || cPrevX == CCLASS_L)) {
+            continue; /* No vowels beside L or R */
+        }
+        if ((c == CCLASS_R || c == CCLASS_L) && cPrevX == CCLASS_VOWEL) {
+            nOut--; /* No vowels beside L or R */
+        }
+        cPrev = c;
+        if (c == CCLASS_SILENT)
+            continue;
+        cPrevX = c;
+        c = className[c];
+        assert(nOut >= 0);
+        /* Collapse runs of the same output character. */
+        if (nOut == 0 || c != zOut[nOut - 1])
+            zOut[nOut++] = c;
+    }
+    zOut[nOut] = 0;
+    return zOut;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/rsoundex.c b/libsql-ffi/bundled/sqlean/fuzzy/rsoundex.c
new file mode 100644
index 0000000000..c2b66d878e
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/rsoundex.c
@@ -0,0 +1,121 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Helper function that returns the numeric code for a given char as specified
+/// by the refined soundex algorithm.
+///
+/// @param c char to encode
+///
+/// @returns char representation of the number associated with the given char,
+/// '0' for every char without a code (vowels, digits, punctuation, ...)
+static char rsoundex_encode(const char c) {
+    // <ctype.h> functions require a value representable as unsigned char;
+    // a negative plain char (any byte >= 0x80) would be undefined behaviour
+    switch (tolower((unsigned char)c)) {
+        case 'b':
+        case 'p':
+            return '1';
+
+        case 'f':
+        case 'v':
+            return '2';
+
+        case 'c':
+        case 'k':
+        case 's':
+            return '3';
+
+        case 'g':
+        case 'j':
+            return '4';
+
+        case 'q':
+        case 'x':
+        case 'z':
+            return '5';
+
+        case 'd':
+        case 't':
+            return '6';
+
+        case 'l':
+            return '7';
+
+        case 'm':
+        case 'n':
+            return '8';
+
+        case 'r':
+            return '9';
+
+        default:
+            break;
+    }
+
+    return '0';
+}
+
+/// Computes and returns the refined soundex representation of a given non NULL
+/// string. More information about the algorithm can be found here:
+/// http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
+///
+/// @param str non NULL string to encode
+///
+/// @returns refined soundex representation of str in a buffer the caller
+/// releases with free(), or NULL if memory allocation fails
+char* refined_soundex(const char* str) {
+    // string cannot be NULL
+    assert(str != NULL);
+
+    size_t str_len = strlen(str);
+
+    // worst case: the first letter, one digit per input char and the terminator
+    char* code = malloc((str_len + 2) * sizeof(char));
+    if (code == NULL) {
+        return NULL;
+    }
+
+    // set first value to first char in str
+    code[0] = (char)toupper((unsigned char)str[0]);
+
+    // number of chars written to code so far
+    size_t d = 1;
+
+    // append the digit of every char (the first one included), collapsing
+    // runs of identical digits into one
+    char prev = '\0';
+    for (size_t i = 0; i < str_len; i++) {
+        char digit = rsoundex_encode(str[i]);
+        if (NOT_EQ(digit, prev)) {
+            code[d] = digit;
+            d++;
+            prev = digit;
+        }
+    }
+
+    code[d] = '\0';
+
+    return code;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/soundex.c b/libsql-ffi/bundled/sqlean/fuzzy/soundex.c
new file mode 100644
index 0000000000..4f6c58f915
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/soundex.c
@@ -0,0 +1,115 @@
+// Copyright (c) 2014 Ross Bayer, MIT License
+// https://github.com/Rostepher/libstrcmp
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fuzzy/common.h"
+
+/// Helper function that returns the numeric code for a given char as specified
+/// by the soundex algorithm.
+///
+/// @param c char to encode
+///
+/// @returns char representation of the number associated with the given char,
+/// '0' for every char without a code (vowels, h, w, y, non-letters)
+static char soundex_encode(const char c) {
+    // <ctype.h> functions require a value representable as unsigned char;
+    // a negative plain char (any byte >= 0x80) would be undefined behaviour
+    switch (tolower((unsigned char)c)) {
+        case 'b':
+        case 'f':
+        case 'p':
+        case 'v':
+            return '1';
+
+        case 'c':
+        case 'g':
+        case 'j':
+        case 'k':
+        case 'q':
+        case 's':
+        case 'x':
+        case 'z':
+            return '2';
+
+        case 'd':
+        case 't':
+            return '3';
+
+        case 'l':
+            return '4';
+
+        case 'm':
+        case 'n':
+            return '5';
+
+        case 'r':
+            return '6';
+
+        default:
+            break;
+    }
+
+    return '0';
+}
+
+/// Computes and returns the soundex representation of a given non NULL string.
+/// More information about the algorithm can be found here:
+/// https://en.wikipedia.org/wiki/Soundex
+///
+/// @param str non NULL string to encode
+///
+/// @returns soundex representation of str (first letter plus three digits) in
+/// a buffer the caller releases with free(), or NULL if memory allocation fails
+char* soundex(const char* str) {
+    // string cannot be NULL
+    assert(str != NULL);
+
+    size_t str_len = strlen(str);
+
+    // allocate space for final code and null terminator
+    char* code = malloc(5 * sizeof(char));
+
+    // temporary buffer holding the digit of every char in str
+    char* buf = malloc((str_len + 1) * sizeof(char));
+
+    if (code == NULL || buf == NULL) {
+        free(code);
+        free(buf);
+        return NULL;
+    }
+
+    // set first value to first char in str
+    code[0] = (char)toupper((unsigned char)str[0]);
+
+    // number of chars in code
+    unsigned d = 1;
+
+    // encode all chars in str
+    for (size_t i = 0; i < str_len; i++) {
+        buf[i] = soundex_encode(str[i]);
+    }
+
+    // add all viable chars to code
+    for (size_t i = 1; i < str_len && d < 4; i++) {
+        // skip chars without a code and chars repeating the previous digit
+        if (EQ(buf[i], buf[i - 1]) || EQ(buf[i], '0')) {
+            continue;
+        }
+
+        // if digits separated by an 'h' or 'w' are the same, skip them.
+        // The separator is lower-cased first so that the rule agrees with
+        // soundex_encode(), which ignores case.
+        if (i > 1 && EQ(buf[i], buf[i - 2])) {
+            int sep = tolower((unsigned char)str[i - 1]);
+            if (sep == 'h' || sep == 'w') {
+                continue;
+            }
+        }
+
+        // add digit to the code
+        code[d] = buf[i];
+        d++;
+    }
+
+    // pad the end of code with '0' if too short
+    while (d < 4) {
+        code[d] = '0';
+        d++;
+    }
+
+    // null terminate string
+    code[d] = '\0';
+    free(buf);
+
+    return code;
+}
diff --git a/libsql-ffi/bundled/sqlean/fuzzy/translit.c b/libsql-ffi/bundled/sqlean/fuzzy/translit.c
new file mode 100644
index 0000000000..cc9290e784
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/fuzzy/translit.c
@@ -0,0 +1,610 @@
+// Originally from the spellfix SQLite extension, Public Domain
+// https://www.sqlite.org/src/file/ext/misc/spellfix.c
+// Modified by Anton Zhiyanov, https://github.com/nalgeon/sqlean/, MIT License
+
+#include <stdlib.h>
+
+#include "fuzzy/common.h"
+
+extern const unsigned char midClass[];
+extern const unsigned char initClass[];
+extern const unsigned char className[];
+
+/*
+** This lookup table is used to help decode the first byte of
+** a multi-byte UTF8 character. It is indexed by (first byte - 0xc0)
+** and yields the payload bits carried by that lead byte.
+*/
+static const unsigned char translit_utf8_lookup[] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
+};
+
+/*
+** Return the value of the first UTF-8 character in the string.
+**
+** z points at the input bytes and n is the number of bytes available.
+** *pSize is set to the number of bytes consumed (0 only when n is 0).
+*/
+static int utf8Read(const unsigned char* z, int n, int* pSize) {
+    int c, i;
+
+    /* All callers to this routine (in the current implementation)
+    ** always have n>0. */
+    if (NEVER(n == 0)) {
+        c = i = 0;
+    } else {
+        c = z[0];
+        i = 1;
+        if (c >= 0xc0) {
+            /* Lead byte of a multi-byte sequence: take its payload bits, then
+            ** fold in 6 bits from each continuation byte (10xxxxxx). */
+            c = translit_utf8_lookup[c - 0xc0];
+            while (i < n && (z[i] & 0xc0) == 0x80) {
+                c = (c << 6) + (0x3f & z[i++]);
+            }
+        }
+    }
+    *pSize = i;
+    return c;
+}
+
+/* One table entry: unicode code point cFrom is replaced by up to four
+** ASCII characters cTo0..cTo3; unused trailing slots are 0x00. */
+typedef struct Transliteration Transliteration;
+struct Transliteration {
+    unsigned short int cFrom;
+    unsigned char cTo0, cTo1, cTo2, cTo3;
+};
+
+/*
+** Table of translations from unicode characters into ASCII.
+*/ +static const Transliteration translit[] = { + {0x00A0, 0x20, 0x00, 0x00, 0x00}, /*   to */ + {0x00B5, 0x75, 0x00, 0x00, 0x00}, /* µ to u */ + {0x00C0, 0x41, 0x00, 0x00, 0x00}, /* À to A */ + {0x00C1, 0x41, 0x00, 0x00, 0x00}, /* Á to A */ + {0x00C2, 0x41, 0x00, 0x00, 0x00}, /*  to A */ + {0x00C3, 0x41, 0x00, 0x00, 0x00}, /* à to A */ + {0x00C4, 0x41, 0x65, 0x00, 0x00}, /* Ä to Ae */ + {0x00C5, 0x41, 0x61, 0x00, 0x00}, /* Å to Aa */ + {0x00C6, 0x41, 0x45, 0x00, 0x00}, /* Æ to AE */ + {0x00C7, 0x43, 0x00, 0x00, 0x00}, /* Ç to C */ + {0x00C8, 0x45, 0x00, 0x00, 0x00}, /* È to E */ + {0x00C9, 0x45, 0x00, 0x00, 0x00}, /* É to E */ + {0x00CA, 0x45, 0x00, 0x00, 0x00}, /* Ê to E */ + {0x00CB, 0x45, 0x00, 0x00, 0x00}, /* Ë to E */ + {0x00CC, 0x49, 0x00, 0x00, 0x00}, /* Ì to I */ + {0x00CD, 0x49, 0x00, 0x00, 0x00}, /* Í to I */ + {0x00CE, 0x49, 0x00, 0x00, 0x00}, /* Î to I */ + {0x00CF, 0x49, 0x00, 0x00, 0x00}, /* Ï to I */ + {0x00D0, 0x44, 0x00, 0x00, 0x00}, /* Ð to D */ + {0x00D1, 0x4E, 0x00, 0x00, 0x00}, /* Ñ to N */ + {0x00D2, 0x4F, 0x00, 0x00, 0x00}, /* Ò to O */ + {0x00D3, 0x4F, 0x00, 0x00, 0x00}, /* Ó to O */ + {0x00D4, 0x4F, 0x00, 0x00, 0x00}, /* Ô to O */ + {0x00D5, 0x4F, 0x00, 0x00, 0x00}, /* Õ to O */ + {0x00D6, 0x4F, 0x65, 0x00, 0x00}, /* Ö to Oe */ + {0x00D7, 0x78, 0x00, 0x00, 0x00}, /* × to x */ + {0x00D8, 0x4F, 0x00, 0x00, 0x00}, /* Ø to O */ + {0x00D9, 0x55, 0x00, 0x00, 0x00}, /* Ù to U */ + {0x00DA, 0x55, 0x00, 0x00, 0x00}, /* Ú to U */ + {0x00DB, 0x55, 0x00, 0x00, 0x00}, /* Û to U */ + {0x00DC, 0x55, 0x65, 0x00, 0x00}, /* Ü to Ue */ + {0x00DD, 0x59, 0x00, 0x00, 0x00}, /* Ý to Y */ + {0x00DE, 0x54, 0x68, 0x00, 0x00}, /* Þ to Th */ + {0x00DF, 0x73, 0x73, 0x00, 0x00}, /* ß to ss */ + {0x00E0, 0x61, 0x00, 0x00, 0x00}, /* à to a */ + {0x00E1, 0x61, 0x00, 0x00, 0x00}, /* á to a */ + {0x00E2, 0x61, 0x00, 0x00, 0x00}, /* â to a */ + {0x00E3, 0x61, 0x00, 0x00, 0x00}, /* ã to a */ + {0x00E4, 0x61, 0x65, 0x00, 0x00}, /* ä to ae */ + {0x00E5, 0x61, 0x61, 0x00, 
0x00}, /* å to aa */ + {0x00E6, 0x61, 0x65, 0x00, 0x00}, /* æ to ae */ + {0x00E7, 0x63, 0x00, 0x00, 0x00}, /* ç to c */ + {0x00E8, 0x65, 0x00, 0x00, 0x00}, /* è to e */ + {0x00E9, 0x65, 0x00, 0x00, 0x00}, /* é to e */ + {0x00EA, 0x65, 0x00, 0x00, 0x00}, /* ê to e */ + {0x00EB, 0x65, 0x00, 0x00, 0x00}, /* ë to e */ + {0x00EC, 0x69, 0x00, 0x00, 0x00}, /* ì to i */ + {0x00ED, 0x69, 0x00, 0x00, 0x00}, /* í to i */ + {0x00EE, 0x69, 0x00, 0x00, 0x00}, /* î to i */ + {0x00EF, 0x69, 0x00, 0x00, 0x00}, /* ï to i */ + {0x00F0, 0x64, 0x00, 0x00, 0x00}, /* ð to d */ + {0x00F1, 0x6E, 0x00, 0x00, 0x00}, /* ñ to n */ + {0x00F2, 0x6F, 0x00, 0x00, 0x00}, /* ò to o */ + {0x00F3, 0x6F, 0x00, 0x00, 0x00}, /* ó to o */ + {0x00F4, 0x6F, 0x00, 0x00, 0x00}, /* ô to o */ + {0x00F5, 0x6F, 0x00, 0x00, 0x00}, /* õ to o */ + {0x00F6, 0x6F, 0x65, 0x00, 0x00}, /* ö to oe */ + {0x00F7, 0x3A, 0x00, 0x00, 0x00}, /* ÷ to : */ + {0x00F8, 0x6F, 0x00, 0x00, 0x00}, /* ø to o */ + {0x00F9, 0x75, 0x00, 0x00, 0x00}, /* ù to u */ + {0x00FA, 0x75, 0x00, 0x00, 0x00}, /* ú to u */ + {0x00FB, 0x75, 0x00, 0x00, 0x00}, /* û to u */ + {0x00FC, 0x75, 0x65, 0x00, 0x00}, /* ü to ue */ + {0x00FD, 0x79, 0x00, 0x00, 0x00}, /* ý to y */ + {0x00FE, 0x74, 0x68, 0x00, 0x00}, /* þ to th */ + {0x00FF, 0x79, 0x00, 0x00, 0x00}, /* ÿ to y */ + {0x0100, 0x41, 0x00, 0x00, 0x00}, /* Ā to A */ + {0x0101, 0x61, 0x00, 0x00, 0x00}, /* ā to a */ + {0x0102, 0x41, 0x00, 0x00, 0x00}, /* Ă to A */ + {0x0103, 0x61, 0x00, 0x00, 0x00}, /* ă to a */ + {0x0104, 0x41, 0x00, 0x00, 0x00}, /* Ą to A */ + {0x0105, 0x61, 0x00, 0x00, 0x00}, /* ą to a */ + {0x0106, 0x43, 0x00, 0x00, 0x00}, /* Ć to C */ + {0x0107, 0x63, 0x00, 0x00, 0x00}, /* ć to c */ + {0x0108, 0x43, 0x68, 0x00, 0x00}, /* Ĉ to Ch */ + {0x0109, 0x63, 0x68, 0x00, 0x00}, /* ĉ to ch */ + {0x010A, 0x43, 0x00, 0x00, 0x00}, /* Ċ to C */ + {0x010B, 0x63, 0x00, 0x00, 0x00}, /* ċ to c */ + {0x010C, 0x43, 0x00, 0x00, 0x00}, /* Č to C */ + {0x010D, 0x63, 0x00, 0x00, 0x00}, /* č to c */ + {0x010E, 
0x44, 0x00, 0x00, 0x00}, /* Ď to D */ + {0x010F, 0x64, 0x00, 0x00, 0x00}, /* ď to d */ + {0x0110, 0x44, 0x00, 0x00, 0x00}, /* Đ to D */ + {0x0111, 0x64, 0x00, 0x00, 0x00}, /* đ to d */ + {0x0112, 0x45, 0x00, 0x00, 0x00}, /* Ē to E */ + {0x0113, 0x65, 0x00, 0x00, 0x00}, /* ē to e */ + {0x0114, 0x45, 0x00, 0x00, 0x00}, /* Ĕ to E */ + {0x0115, 0x65, 0x00, 0x00, 0x00}, /* ĕ to e */ + {0x0116, 0x45, 0x00, 0x00, 0x00}, /* Ė to E */ + {0x0117, 0x65, 0x00, 0x00, 0x00}, /* ė to e */ + {0x0118, 0x45, 0x00, 0x00, 0x00}, /* Ę to E */ + {0x0119, 0x65, 0x00, 0x00, 0x00}, /* ę to e */ + {0x011A, 0x45, 0x00, 0x00, 0x00}, /* Ě to E */ + {0x011B, 0x65, 0x00, 0x00, 0x00}, /* ě to e */ + {0x011C, 0x47, 0x68, 0x00, 0x00}, /* Ĝ to Gh */ + {0x011D, 0x67, 0x68, 0x00, 0x00}, /* ĝ to gh */ + {0x011E, 0x47, 0x00, 0x00, 0x00}, /* Ğ to G */ + {0x011F, 0x67, 0x00, 0x00, 0x00}, /* ğ to g */ + {0x0120, 0x47, 0x00, 0x00, 0x00}, /* Ġ to G */ + {0x0121, 0x67, 0x00, 0x00, 0x00}, /* ġ to g */ + {0x0122, 0x47, 0x00, 0x00, 0x00}, /* Ģ to G */ + {0x0123, 0x67, 0x00, 0x00, 0x00}, /* ģ to g */ + {0x0124, 0x48, 0x68, 0x00, 0x00}, /* Ĥ to Hh */ + {0x0125, 0x68, 0x68, 0x00, 0x00}, /* ĥ to hh */ + {0x0126, 0x48, 0x00, 0x00, 0x00}, /* Ħ to H */ + {0x0127, 0x68, 0x00, 0x00, 0x00}, /* ħ to h */ + {0x0128, 0x49, 0x00, 0x00, 0x00}, /* Ĩ to I */ + {0x0129, 0x69, 0x00, 0x00, 0x00}, /* ĩ to i */ + {0x012A, 0x49, 0x00, 0x00, 0x00}, /* Ī to I */ + {0x012B, 0x69, 0x00, 0x00, 0x00}, /* ī to i */ + {0x012C, 0x49, 0x00, 0x00, 0x00}, /* Ĭ to I */ + {0x012D, 0x69, 0x00, 0x00, 0x00}, /* ĭ to i */ + {0x012E, 0x49, 0x00, 0x00, 0x00}, /* Į to I */ + {0x012F, 0x69, 0x00, 0x00, 0x00}, /* į to i */ + {0x0130, 0x49, 0x00, 0x00, 0x00}, /* İ to I */ + {0x0131, 0x69, 0x00, 0x00, 0x00}, /* ı to i */ + {0x0132, 0x49, 0x4A, 0x00, 0x00}, /* IJ to IJ */ + {0x0133, 0x69, 0x6A, 0x00, 0x00}, /* ij to ij */ + {0x0134, 0x4A, 0x68, 0x00, 0x00}, /* Ĵ to Jh */ + {0x0135, 0x6A, 0x68, 0x00, 0x00}, /* ĵ to jh */ + {0x0136, 0x4B, 0x00, 0x00, 0x00}, /* Ķ 
to K */ + {0x0137, 0x6B, 0x00, 0x00, 0x00}, /* ķ to k */ + {0x0138, 0x6B, 0x00, 0x00, 0x00}, /* ĸ to k */ + {0x0139, 0x4C, 0x00, 0x00, 0x00}, /* Ĺ to L */ + {0x013A, 0x6C, 0x00, 0x00, 0x00}, /* ĺ to l */ + {0x013B, 0x4C, 0x00, 0x00, 0x00}, /* Ļ to L */ + {0x013C, 0x6C, 0x00, 0x00, 0x00}, /* ļ to l */ + {0x013D, 0x4C, 0x00, 0x00, 0x00}, /* Ľ to L */ + {0x013E, 0x6C, 0x00, 0x00, 0x00}, /* ľ to l */ + {0x013F, 0x4C, 0x2E, 0x00, 0x00}, /* Ŀ to L. */ + {0x0140, 0x6C, 0x2E, 0x00, 0x00}, /* ŀ to l. */ + {0x0141, 0x4C, 0x00, 0x00, 0x00}, /* Ł to L */ + {0x0142, 0x6C, 0x00, 0x00, 0x00}, /* ł to l */ + {0x0143, 0x4E, 0x00, 0x00, 0x00}, /* Ń to N */ + {0x0144, 0x6E, 0x00, 0x00, 0x00}, /* ń to n */ + {0x0145, 0x4E, 0x00, 0x00, 0x00}, /* Ņ to N */ + {0x0146, 0x6E, 0x00, 0x00, 0x00}, /* ņ to n */ + {0x0147, 0x4E, 0x00, 0x00, 0x00}, /* Ň to N */ + {0x0148, 0x6E, 0x00, 0x00, 0x00}, /* ň to n */ + {0x0149, 0x27, 0x6E, 0x00, 0x00}, /* ʼn to 'n */ + {0x014A, 0x4E, 0x47, 0x00, 0x00}, /* Ŋ to NG */ + {0x014B, 0x6E, 0x67, 0x00, 0x00}, /* ŋ to ng */ + {0x014C, 0x4F, 0x00, 0x00, 0x00}, /* Ō to O */ + {0x014D, 0x6F, 0x00, 0x00, 0x00}, /* ō to o */ + {0x014E, 0x4F, 0x00, 0x00, 0x00}, /* Ŏ to O */ + {0x014F, 0x6F, 0x00, 0x00, 0x00}, /* ŏ to o */ + {0x0150, 0x4F, 0x00, 0x00, 0x00}, /* Ő to O */ + {0x0151, 0x6F, 0x00, 0x00, 0x00}, /* ő to o */ + {0x0152, 0x4F, 0x45, 0x00, 0x00}, /* Œ to OE */ + {0x0153, 0x6F, 0x65, 0x00, 0x00}, /* œ to oe */ + {0x0154, 0x52, 0x00, 0x00, 0x00}, /* Ŕ to R */ + {0x0155, 0x72, 0x00, 0x00, 0x00}, /* ŕ to r */ + {0x0156, 0x52, 0x00, 0x00, 0x00}, /* Ŗ to R */ + {0x0157, 0x72, 0x00, 0x00, 0x00}, /* ŗ to r */ + {0x0158, 0x52, 0x00, 0x00, 0x00}, /* Ř to R */ + {0x0159, 0x72, 0x00, 0x00, 0x00}, /* ř to r */ + {0x015A, 0x53, 0x00, 0x00, 0x00}, /* Ś to S */ + {0x015B, 0x73, 0x00, 0x00, 0x00}, /* ś to s */ + {0x015C, 0x53, 0x68, 0x00, 0x00}, /* Ŝ to Sh */ + {0x015D, 0x73, 0x68, 0x00, 0x00}, /* ŝ to sh */ + {0x015E, 0x53, 0x00, 0x00, 0x00}, /* Ş to S */ + {0x015F, 0x73, 
0x00, 0x00, 0x00}, /* ş to s */ + {0x0160, 0x53, 0x00, 0x00, 0x00}, /* Š to S */ + {0x0161, 0x73, 0x00, 0x00, 0x00}, /* š to s */ + {0x0162, 0x54, 0x00, 0x00, 0x00}, /* Ţ to T */ + {0x0163, 0x74, 0x00, 0x00, 0x00}, /* ţ to t */ + {0x0164, 0x54, 0x00, 0x00, 0x00}, /* Ť to T */ + {0x0165, 0x74, 0x00, 0x00, 0x00}, /* ť to t */ + {0x0166, 0x54, 0x00, 0x00, 0x00}, /* Ŧ to T */ + {0x0167, 0x74, 0x00, 0x00, 0x00}, /* ŧ to t */ + {0x0168, 0x55, 0x00, 0x00, 0x00}, /* Ũ to U */ + {0x0169, 0x75, 0x00, 0x00, 0x00}, /* ũ to u */ + {0x016A, 0x55, 0x00, 0x00, 0x00}, /* Ū to U */ + {0x016B, 0x75, 0x00, 0x00, 0x00}, /* ū to u */ + {0x016C, 0x55, 0x00, 0x00, 0x00}, /* Ŭ to U */ + {0x016D, 0x75, 0x00, 0x00, 0x00}, /* ŭ to u */ + {0x016E, 0x55, 0x00, 0x00, 0x00}, /* Ů to U */ + {0x016F, 0x75, 0x00, 0x00, 0x00}, /* ů to u */ + {0x0170, 0x55, 0x00, 0x00, 0x00}, /* Ű to U */ + {0x0171, 0x75, 0x00, 0x00, 0x00}, /* ű to u */ + {0x0172, 0x55, 0x00, 0x00, 0x00}, /* Ų to U */ + {0x0173, 0x75, 0x00, 0x00, 0x00}, /* ų to u */ + {0x0174, 0x57, 0x00, 0x00, 0x00}, /* Ŵ to W */ + {0x0175, 0x77, 0x00, 0x00, 0x00}, /* ŵ to w */ + {0x0176, 0x59, 0x00, 0x00, 0x00}, /* Ŷ to Y */ + {0x0177, 0x79, 0x00, 0x00, 0x00}, /* ŷ to y */ + {0x0178, 0x59, 0x00, 0x00, 0x00}, /* Ÿ to Y */ + {0x0179, 0x5A, 0x00, 0x00, 0x00}, /* Ź to Z */ + {0x017A, 0x7A, 0x00, 0x00, 0x00}, /* ź to z */ + {0x017B, 0x5A, 0x00, 0x00, 0x00}, /* Ż to Z */ + {0x017C, 0x7A, 0x00, 0x00, 0x00}, /* ż to z */ + {0x017D, 0x5A, 0x00, 0x00, 0x00}, /* Ž to Z */ + {0x017E, 0x7A, 0x00, 0x00, 0x00}, /* ž to z */ + {0x017F, 0x73, 0x00, 0x00, 0x00}, /* ſ to s */ + {0x0192, 0x66, 0x00, 0x00, 0x00}, /* ƒ to f */ + {0x0218, 0x53, 0x00, 0x00, 0x00}, /* Ș to S */ + {0x0219, 0x73, 0x00, 0x00, 0x00}, /* ș to s */ + {0x021A, 0x54, 0x00, 0x00, 0x00}, /* Ț to T */ + {0x021B, 0x74, 0x00, 0x00, 0x00}, /* ț to t */ + {0x0386, 0x41, 0x00, 0x00, 0x00}, /* Ά to A */ + {0x0388, 0x45, 0x00, 0x00, 0x00}, /* Έ to E */ + {0x0389, 0x49, 0x00, 0x00, 0x00}, /* Ή to I */ + 
{0x038A, 0x49, 0x00, 0x00, 0x00}, /* Ί to I */ + {0x038C, 0x4f, 0x00, 0x00, 0x00}, /* Ό to O */ + {0x038E, 0x59, 0x00, 0x00, 0x00}, /* Ύ to Y */ + {0x038F, 0x4f, 0x00, 0x00, 0x00}, /* Ώ to O */ + {0x0390, 0x69, 0x00, 0x00, 0x00}, /* ΐ to i */ + {0x0391, 0x41, 0x00, 0x00, 0x00}, /* Α to A */ + {0x0392, 0x42, 0x00, 0x00, 0x00}, /* Β to B */ + {0x0393, 0x47, 0x00, 0x00, 0x00}, /* Γ to G */ + {0x0394, 0x44, 0x00, 0x00, 0x00}, /* Δ to D */ + {0x0395, 0x45, 0x00, 0x00, 0x00}, /* Ε to E */ + {0x0396, 0x5a, 0x00, 0x00, 0x00}, /* Ζ to Z */ + {0x0397, 0x49, 0x00, 0x00, 0x00}, /* Η to I */ + {0x0398, 0x54, 0x68, 0x00, 0x00}, /* Θ to Th */ + {0x0399, 0x49, 0x00, 0x00, 0x00}, /* Ι to I */ + {0x039A, 0x4b, 0x00, 0x00, 0x00}, /* Κ to K */ + {0x039B, 0x4c, 0x00, 0x00, 0x00}, /* Λ to L */ + {0x039C, 0x4d, 0x00, 0x00, 0x00}, /* Μ to M */ + {0x039D, 0x4e, 0x00, 0x00, 0x00}, /* Ν to N */ + {0x039E, 0x58, 0x00, 0x00, 0x00}, /* Ξ to X */ + {0x039F, 0x4f, 0x00, 0x00, 0x00}, /* Ο to O */ + {0x03A0, 0x50, 0x00, 0x00, 0x00}, /* Π to P */ + {0x03A1, 0x52, 0x00, 0x00, 0x00}, /* Ρ to R */ + {0x03A3, 0x53, 0x00, 0x00, 0x00}, /* Σ to S */ + {0x03A4, 0x54, 0x00, 0x00, 0x00}, /* Τ to T */ + {0x03A5, 0x59, 0x00, 0x00, 0x00}, /* Υ to Y */ + {0x03A6, 0x46, 0x00, 0x00, 0x00}, /* Φ to F */ + {0x03A7, 0x43, 0x68, 0x00, 0x00}, /* Χ to Ch */ + {0x03A8, 0x50, 0x73, 0x00, 0x00}, /* Ψ to Ps */ + {0x03A9, 0x4f, 0x00, 0x00, 0x00}, /* Ω to O */ + {0x03AA, 0x49, 0x00, 0x00, 0x00}, /* Ϊ to I */ + {0x03AB, 0x59, 0x00, 0x00, 0x00}, /* Ϋ to Y */ + {0x03AC, 0x61, 0x00, 0x00, 0x00}, /* ά to a */ + {0x03AD, 0x65, 0x00, 0x00, 0x00}, /* έ to e */ + {0x03AE, 0x69, 0x00, 0x00, 0x00}, /* ή to i */ + {0x03AF, 0x69, 0x00, 0x00, 0x00}, /* ί to i */ + {0x03B1, 0x61, 0x00, 0x00, 0x00}, /* α to a */ + {0x03B2, 0x62, 0x00, 0x00, 0x00}, /* β to b */ + {0x03B3, 0x67, 0x00, 0x00, 0x00}, /* γ to g */ + {0x03B4, 0x64, 0x00, 0x00, 0x00}, /* δ to d */ + {0x03B5, 0x65, 0x00, 0x00, 0x00}, /* ε to e */ + {0x03B6, 0x7a, 0x00, 0x00, 0x00}, /* 
ζ to z */ + {0x03B7, 0x69, 0x00, 0x00, 0x00}, /* η to i */ + {0x03B8, 0x74, 0x68, 0x00, 0x00}, /* θ to th */ + {0x03B9, 0x69, 0x00, 0x00, 0x00}, /* ι to i */ + {0x03BA, 0x6b, 0x00, 0x00, 0x00}, /* κ to k */ + {0x03BB, 0x6c, 0x00, 0x00, 0x00}, /* λ to l */ + {0x03BC, 0x6d, 0x00, 0x00, 0x00}, /* μ to m */ + {0x03BD, 0x6e, 0x00, 0x00, 0x00}, /* ν to n */ + {0x03BE, 0x78, 0x00, 0x00, 0x00}, /* ξ to x */ + {0x03BF, 0x6f, 0x00, 0x00, 0x00}, /* ο to o */ + {0x03C0, 0x70, 0x00, 0x00, 0x00}, /* π to p */ + {0x03C1, 0x72, 0x00, 0x00, 0x00}, /* ρ to r */ + {0x03C3, 0x73, 0x00, 0x00, 0x00}, /* σ to s */ + {0x03C4, 0x74, 0x00, 0x00, 0x00}, /* τ to t */ + {0x03C5, 0x79, 0x00, 0x00, 0x00}, /* υ to y */ + {0x03C6, 0x66, 0x00, 0x00, 0x00}, /* φ to f */ + {0x03C7, 0x63, 0x68, 0x00, 0x00}, /* χ to ch */ + {0x03C8, 0x70, 0x73, 0x00, 0x00}, /* ψ to ps */ + {0x03C9, 0x6f, 0x00, 0x00, 0x00}, /* ω to o */ + {0x03CA, 0x69, 0x00, 0x00, 0x00}, /* ϊ to i */ + {0x03CB, 0x79, 0x00, 0x00, 0x00}, /* ϋ to y */ + {0x03CC, 0x6f, 0x00, 0x00, 0x00}, /* ό to o */ + {0x03CD, 0x79, 0x00, 0x00, 0x00}, /* ύ to y */ + {0x03CE, 0x69, 0x00, 0x00, 0x00}, /* ώ to i */ + {0x0400, 0x45, 0x00, 0x00, 0x00}, /* Ѐ to E */ + {0x0401, 0x45, 0x00, 0x00, 0x00}, /* Ё to E */ + {0x0402, 0x44, 0x00, 0x00, 0x00}, /* Ђ to D */ + {0x0403, 0x47, 0x00, 0x00, 0x00}, /* Ѓ to G */ + {0x0404, 0x45, 0x00, 0x00, 0x00}, /* Є to E */ + {0x0405, 0x5a, 0x00, 0x00, 0x00}, /* Ѕ to Z */ + {0x0406, 0x49, 0x00, 0x00, 0x00}, /* І to I */ + {0x0407, 0x49, 0x00, 0x00, 0x00}, /* Ї to I */ + {0x0408, 0x4a, 0x00, 0x00, 0x00}, /* Ј to J */ + {0x0409, 0x49, 0x00, 0x00, 0x00}, /* Љ to I */ + {0x040A, 0x4e, 0x00, 0x00, 0x00}, /* Њ to N */ + {0x040B, 0x44, 0x00, 0x00, 0x00}, /* Ћ to D */ + {0x040C, 0x4b, 0x00, 0x00, 0x00}, /* Ќ to K */ + {0x040D, 0x49, 0x00, 0x00, 0x00}, /* Ѝ to I */ + {0x040E, 0x55, 0x00, 0x00, 0x00}, /* Ў to U */ + {0x040F, 0x44, 0x00, 0x00, 0x00}, /* Џ to D */ + {0x0410, 0x41, 0x00, 0x00, 0x00}, /* А to A */ + {0x0411, 0x42, 0x00, 
0x00, 0x00}, /* Б to B */ + {0x0412, 0x56, 0x00, 0x00, 0x00}, /* В to V */ + {0x0413, 0x47, 0x00, 0x00, 0x00}, /* Г to G */ + {0x0414, 0x44, 0x00, 0x00, 0x00}, /* Д to D */ + {0x0415, 0x45, 0x00, 0x00, 0x00}, /* Е to E */ + {0x0416, 0x5a, 0x68, 0x00, 0x00}, /* Ж to Zh */ + {0x0417, 0x5a, 0x00, 0x00, 0x00}, /* З to Z */ + {0x0418, 0x49, 0x00, 0x00, 0x00}, /* И to I */ + {0x0419, 0x49, 0x00, 0x00, 0x00}, /* Й to I */ + {0x041A, 0x4b, 0x00, 0x00, 0x00}, /* К to K */ + {0x041B, 0x4c, 0x00, 0x00, 0x00}, /* Л to L */ + {0x041C, 0x4d, 0x00, 0x00, 0x00}, /* М to M */ + {0x041D, 0x4e, 0x00, 0x00, 0x00}, /* Н to N */ + {0x041E, 0x4f, 0x00, 0x00, 0x00}, /* О to O */ + {0x041F, 0x50, 0x00, 0x00, 0x00}, /* П to P */ + {0x0420, 0x52, 0x00, 0x00, 0x00}, /* Р to R */ + {0x0421, 0x53, 0x00, 0x00, 0x00}, /* С to S */ + {0x0422, 0x54, 0x00, 0x00, 0x00}, /* Т to T */ + {0x0423, 0x55, 0x00, 0x00, 0x00}, /* У to U */ + {0x0424, 0x46, 0x00, 0x00, 0x00}, /* Ф to F */ + {0x0425, 0x4b, 0x68, 0x00, 0x00}, /* Х to Kh */ + {0x0426, 0x54, 0x63, 0x00, 0x00}, /* Ц to Tc */ + {0x0427, 0x43, 0x68, 0x00, 0x00}, /* Ч to Ch */ + {0x0428, 0x53, 0x68, 0x00, 0x00}, /* Ш to Sh */ + {0x0429, 0x53, 0x68, 0x63, 0x68}, /* Щ to Shch */ + {0x042A, 0x61, 0x00, 0x00, 0x00}, /* to A */ + {0x042B, 0x59, 0x00, 0x00, 0x00}, /* Ы to Y */ + {0x042C, 0x59, 0x00, 0x00, 0x00}, /* to Y */ + {0x042D, 0x45, 0x00, 0x00, 0x00}, /* Э to E */ + {0x042E, 0x49, 0x75, 0x00, 0x00}, /* Ю to Iu */ + {0x042F, 0x49, 0x61, 0x00, 0x00}, /* Я to Ia */ + {0x0430, 0x61, 0x00, 0x00, 0x00}, /* а to a */ + {0x0431, 0x62, 0x00, 0x00, 0x00}, /* б to b */ + {0x0432, 0x76, 0x00, 0x00, 0x00}, /* в to v */ + {0x0433, 0x67, 0x00, 0x00, 0x00}, /* г to g */ + {0x0434, 0x64, 0x00, 0x00, 0x00}, /* д to d */ + {0x0435, 0x65, 0x00, 0x00, 0x00}, /* е to e */ + {0x0436, 0x7a, 0x68, 0x00, 0x00}, /* ж to zh */ + {0x0437, 0x7a, 0x00, 0x00, 0x00}, /* з to z */ + {0x0438, 0x69, 0x00, 0x00, 0x00}, /* и to i */ + {0x0439, 0x69, 0x00, 0x00, 0x00}, /* й to i */ + 
{0x043A, 0x6b, 0x00, 0x00, 0x00}, /* к to k */ + {0x043B, 0x6c, 0x00, 0x00, 0x00}, /* л to l */ + {0x043C, 0x6d, 0x00, 0x00, 0x00}, /* м to m */ + {0x043D, 0x6e, 0x00, 0x00, 0x00}, /* н to n */ + {0x043E, 0x6f, 0x00, 0x00, 0x00}, /* о to o */ + {0x043F, 0x70, 0x00, 0x00, 0x00}, /* п to p */ + {0x0440, 0x72, 0x00, 0x00, 0x00}, /* р to r */ + {0x0441, 0x73, 0x00, 0x00, 0x00}, /* с to s */ + {0x0442, 0x74, 0x00, 0x00, 0x00}, /* т to t */ + {0x0443, 0x75, 0x00, 0x00, 0x00}, /* у to u */ + {0x0444, 0x66, 0x00, 0x00, 0x00}, /* ф to f */ + {0x0445, 0x6b, 0x68, 0x00, 0x00}, /* х to kh */ + {0x0446, 0x74, 0x63, 0x00, 0x00}, /* ц to tc */ + {0x0447, 0x63, 0x68, 0x00, 0x00}, /* ч to ch */ + {0x0448, 0x73, 0x68, 0x00, 0x00}, /* ш to sh */ + {0x0449, 0x73, 0x68, 0x63, 0x68}, /* щ to shch */ + {0x044A, 0x61, 0x00, 0x00, 0x00}, /* to a */ + {0x044B, 0x79, 0x00, 0x00, 0x00}, /* ы to y */ + {0x044C, 0x79, 0x00, 0x00, 0x00}, /* to y */ + {0x044D, 0x65, 0x00, 0x00, 0x00}, /* э to e */ + {0x044E, 0x69, 0x75, 0x00, 0x00}, /* ю to iu */ + {0x044F, 0x69, 0x61, 0x00, 0x00}, /* я to ia */ + {0x0450, 0x65, 0x00, 0x00, 0x00}, /* ѐ to e */ + {0x0451, 0x65, 0x00, 0x00, 0x00}, /* ё to e */ + {0x0452, 0x64, 0x00, 0x00, 0x00}, /* ђ to d */ + {0x0453, 0x67, 0x00, 0x00, 0x00}, /* ѓ to g */ + {0x0454, 0x65, 0x00, 0x00, 0x00}, /* є to e */ + {0x0455, 0x7a, 0x00, 0x00, 0x00}, /* ѕ to z */ + {0x0456, 0x69, 0x00, 0x00, 0x00}, /* і to i */ + {0x0457, 0x69, 0x00, 0x00, 0x00}, /* ї to i */ + {0x0458, 0x6a, 0x00, 0x00, 0x00}, /* ј to j */ + {0x0459, 0x69, 0x00, 0x00, 0x00}, /* љ to i */ + {0x045A, 0x6e, 0x00, 0x00, 0x00}, /* њ to n */ + {0x045B, 0x64, 0x00, 0x00, 0x00}, /* ћ to d */ + {0x045C, 0x6b, 0x00, 0x00, 0x00}, /* ќ to k */ + {0x045D, 0x69, 0x00, 0x00, 0x00}, /* ѝ to i */ + {0x045E, 0x75, 0x00, 0x00, 0x00}, /* ў to u */ + {0x045F, 0x64, 0x00, 0x00, 0x00}, /* џ to d */ + {0x1E02, 0x42, 0x00, 0x00, 0x00}, /* Ḃ to B */ + {0x1E03, 0x62, 0x00, 0x00, 0x00}, /* ḃ to b */ + {0x1E0A, 0x44, 0x00, 0x00, 0x00}, 
/* Ḋ to D */ + {0x1E0B, 0x64, 0x00, 0x00, 0x00}, /* ḋ to d */ + {0x1E1E, 0x46, 0x00, 0x00, 0x00}, /* Ḟ to F */ + {0x1E1F, 0x66, 0x00, 0x00, 0x00}, /* ḟ to f */ + {0x1E40, 0x4D, 0x00, 0x00, 0x00}, /* Ṁ to M */ + {0x1E41, 0x6D, 0x00, 0x00, 0x00}, /* ṁ to m */ + {0x1E56, 0x50, 0x00, 0x00, 0x00}, /* Ṗ to P */ + {0x1E57, 0x70, 0x00, 0x00, 0x00}, /* ṗ to p */ + {0x1E60, 0x53, 0x00, 0x00, 0x00}, /* Ṡ to S */ + {0x1E61, 0x73, 0x00, 0x00, 0x00}, /* ṡ to s */ + {0x1E6A, 0x54, 0x00, 0x00, 0x00}, /* Ṫ to T */ + {0x1E6B, 0x74, 0x00, 0x00, 0x00}, /* ṫ to t */ + {0x1E80, 0x57, 0x00, 0x00, 0x00}, /* Ẁ to W */ + {0x1E81, 0x77, 0x00, 0x00, 0x00}, /* ẁ to w */ + {0x1E82, 0x57, 0x00, 0x00, 0x00}, /* Ẃ to W */ + {0x1E83, 0x77, 0x00, 0x00, 0x00}, /* ẃ to w */ + {0x1E84, 0x57, 0x00, 0x00, 0x00}, /* Ẅ to W */ + {0x1E85, 0x77, 0x00, 0x00, 0x00}, /* ẅ to w */ + {0x1EF2, 0x59, 0x00, 0x00, 0x00}, /* Ỳ to Y */ + {0x1EF3, 0x79, 0x00, 0x00, 0x00}, /* ỳ to y */ + {0xFB00, 0x66, 0x66, 0x00, 0x00}, /* ff to ff */ + {0xFB01, 0x66, 0x69, 0x00, 0x00}, /* fi to fi */ + {0xFB02, 0x66, 0x6C, 0x00, 0x00}, /* fl to fl */ + {0xFB05, 0x73, 0x74, 0x00, 0x00}, /* ſt to st */ + {0xFB06, 0x73, 0x74, 0x00, 0x00}, /* st to st */ +}; + +static const Transliteration* spellfixFindTranslit(int c, int* pxTop) { + *pxTop = (sizeof(translit) / sizeof(translit[0])) - 1; + return translit; +} + +/* +** Convert the input string from UTF-8 into pure ASCII by converting +** all non-ASCII characters to some combination of characters in the +** ASCII subset. +** +** The returned string might contain more characters than the input. +** +** Space to hold the returned string comes from sqlite3_malloc() and +** should be freed by the caller. 
+*/ +unsigned char* transliterate(const unsigned char* zIn, int nIn) { + unsigned char* zOut = malloc(nIn * 4 + 1); + int c, sz, nOut; + if (zOut == 0) + return 0; + nOut = 0; + while (nIn > 0) { + c = utf8Read(zIn, nIn, &sz); + zIn += sz; + nIn -= sz; + if (c <= 127) { + zOut[nOut++] = (unsigned char)c; + } else { + int xTop, xBtm, x; + const Transliteration* tbl = spellfixFindTranslit(c, &xTop); + xBtm = 0; + while (xTop >= xBtm) { + x = (xTop + xBtm) / 2; + if (tbl[x].cFrom == c) { + zOut[nOut++] = tbl[x].cTo0; + if (tbl[x].cTo1) { + zOut[nOut++] = tbl[x].cTo1; + if (tbl[x].cTo2) { + zOut[nOut++] = tbl[x].cTo2; + if (tbl[x].cTo3) { + zOut[nOut++] = tbl[x].cTo3; + } + } + } + c = 0; + break; + } else if (tbl[x].cFrom > c) { + xTop = x - 1; + } else { + xBtm = x + 1; + } + } + if (c) + zOut[nOut++] = '?'; + } + } + zOut[nOut] = 0; + return zOut; +} + +/* +** Return the number of characters in the shortest prefix of the input +** string that transliterates to an ASCII string nTrans bytes or longer. +** Or, if the transliteration of the input string is less than nTrans +** bytes in size, return the number of characters in the input string. +*/ +int translen_to_charlen(const char* zIn, int nIn, int nTrans) { + int i, c, sz, nOut; + int nChar; + + i = nOut = 0; + for (nChar = 0; i < nIn && nOut < nTrans; nChar++) { + c = utf8Read((const unsigned char*)&zIn[i], nIn - i, &sz); + i += sz; + + nOut++; + if (c >= 128) { + int xTop, xBtm, x; + const Transliteration* tbl = spellfixFindTranslit(c, &xTop); + xBtm = 0; + while (xTop >= xBtm) { + x = (xTop + xBtm) / 2; + if (tbl[x].cFrom == c) { + if (tbl[x].cTo1) { + nOut++; + if (tbl[x].cTo2) { + nOut++; + if (tbl[x].cTo3) { + nOut++; + } + } + } + break; + } else if (tbl[x].cFrom > c) { + xTop = x - 1; + } else { + xBtm = x + 1; + } + } + } + } + + return nChar; +} + +/* + * Try to determine the dominant script used by the word zIn of length nIn + * and return its ISO 15924 numeric code. 
+ */ +int script_code(const unsigned char* zIn, int nIn) { + int c, sz; + int scriptMask = 0; + int res; + int seenDigit = 0; + + while (nIn > 0) { + c = utf8Read(zIn, nIn, &sz); + zIn += sz; + nIn -= sz; + if (c < 0x02af) { + if (c >= 0x80 || midClass[c & 0x7f] < CCLASS_DIGIT) { + scriptMask |= SCRIPT_LATIN; + } else if (c >= '0' && c <= '9') { + seenDigit = 1; + } + } else if (c >= 0x0400 && c <= 0x04ff) { + scriptMask |= SCRIPT_CYRILLIC; + } else if (c >= 0x0386 && c <= 0x03ce) { + scriptMask |= SCRIPT_GREEK; + } else if (c >= 0x0590 && c <= 0x05ff) { + scriptMask |= SCRIPT_HEBREW; + } else if (c >= 0x0600 && c <= 0x06ff) { + scriptMask |= SCRIPT_ARABIC; + } + } + if (scriptMask == 0 && seenDigit) + scriptMask = SCRIPT_LATIN; + switch (scriptMask) { + case 0: + res = 999; + break; + case SCRIPT_LATIN: + res = 215; + break; + case SCRIPT_CYRILLIC: + res = 220; + break; + case SCRIPT_GREEK: + res = 200; + break; + case SCRIPT_HEBREW: + res = 125; + break; + case SCRIPT_ARABIC: + res = 160; + break; + default: + res = 998; + break; + } + return res; +} diff --git a/libsql-ffi/bundled/sqlean/ipaddr/extension.c b/libsql-ffi/bundled/sqlean/ipaddr/extension.c new file mode 100644 index 0000000000..56addcabf4 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/ipaddr/extension.c @@ -0,0 +1,218 @@ +// Copyright (c) 2021 Vincent Bernat, MIT License +// https://github.com/nalgeon/sqlean + +// IP address manipulation in SQLite. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#include +#include +#include +#endif + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +struct ipaddress { + int af; + union { + struct in6_addr ipv6; + struct in_addr ipv4; + }; + unsigned masklen; +}; + +static struct ipaddress* parse_ipaddress(const char* address) { + struct ipaddress* ip = NULL; + unsigned char buf[sizeof(struct in6_addr)]; + char* sep = strchr(address, '/'); + unsigned long masklen = 0; + if (sep) { + char* end; + errno = 0; + masklen = strtoul(sep + 1, &end, 10); + if (errno != 0 || sep + 1 == end || *end != '\0') + return NULL; + *sep = '\0'; + } + if (inet_pton(AF_INET, address, buf)) { + if (sep && masklen > 32) + goto end; + + ip = sqlite3_malloc(sizeof(struct ipaddress)); + memcpy(&ip->ipv4, buf, sizeof(struct in_addr)); + ip->af = AF_INET; + ip->masklen = sep ? masklen : 32; + } else if (inet_pton(AF_INET6, address, buf)) { + if (sep && masklen > 128) + goto end; + + ip = sqlite3_malloc(sizeof(struct ipaddress)); + memcpy(&ip->ipv6, buf, sizeof(struct in6_addr)); + ip->af = AF_INET6; + ip->masklen = sep ? masklen : 128; + } +end: + if (sep) + *sep = '/'; + return ip; +} + +static void ipaddr_ipfamily(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + const char* address = (char*)sqlite3_value_text(argv[0]); + struct ipaddress* ip = parse_ipaddress(address); + if (ip == NULL) { + sqlite3_result_null(context); + return; + } + sqlite3_result_int(context, ip->af == AF_INET ? 
4 : 6); + sqlite3_free(ip); +} + +static void ipaddr_iphost(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + const char* address = (char*)sqlite3_value_text(argv[0]); + struct ipaddress* ip = parse_ipaddress(address); + if (ip == NULL) { + sqlite3_result_null(context); + return; + } + if (ip->af == AF_INET) { + char* result = sqlite3_malloc(INET_ADDRSTRLEN); + inet_ntop(AF_INET, &ip->ipv4, result, INET_ADDRSTRLEN); + sqlite3_result_text(context, result, -1, sqlite3_free); + } else if (ip->af == AF_INET6) { + char* result = sqlite3_malloc(INET6_ADDRSTRLEN); + inet_ntop(AF_INET6, &ip->ipv6, result, INET6_ADDRSTRLEN); + sqlite3_result_text(context, result, -1, sqlite3_free); + } + sqlite3_free(ip); +} + +static void ipaddr_ipmasklen(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + const char* address = (char*)sqlite3_value_text(argv[0]); + struct ipaddress* ip = parse_ipaddress(address); + if (ip == NULL) { + sqlite3_result_null(context); + return; + } + sqlite3_result_int(context, ip->masklen); + return; +} + +static void ipaddr_ipnetwork(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + const char* address = (char*)sqlite3_value_text(argv[0]); + struct ipaddress* ip = parse_ipaddress(address); + if (ip == NULL) { + sqlite3_result_null(context); + return; + } + if (ip->af == AF_INET) { + char buf[INET_ADDRSTRLEN]; + ip->ipv4.s_addr = + htonl(ntohl(ip->ipv4.s_addr) & ~(uint32_t)((1ULL << (32 - ip->masklen)) - 1)); + inet_ntop(AF_INET, &ip->ipv4, buf, INET_ADDRSTRLEN); + char* result = sqlite3_malloc(INET_ADDRSTRLEN + 3); + sprintf(result, "%s/%u", buf, ip->masklen); + 
sqlite3_result_text(context, result, -1, sqlite3_free); + } else if (ip->af == AF_INET6) { + char buf[INET6_ADDRSTRLEN]; + for (unsigned i = 0; i < 16; i++) { + if (ip->masklen / 8 < i) + ip->ipv6.s6_addr[i] = 0; + else if (ip->masklen / 8 == i) + ip->ipv6.s6_addr[i] &= ~(ip->masklen % 8); + } + inet_ntop(AF_INET6, &ip->ipv6, buf, INET6_ADDRSTRLEN); + char* result = sqlite3_malloc(INET6_ADDRSTRLEN + 4); + sprintf(result, "%s/%u", buf, ip->masklen); + sqlite3_result_text(context, result, -1, sqlite3_free); + } + sqlite3_free(ip); +} + +static void ipaddr_ipcontains(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + if (sqlite3_value_type(argv[0]) == SQLITE_NULL || sqlite3_value_type(argv[1]) == SQLITE_NULL) { + sqlite3_result_null(context); + return; + } + + const char* address1 = (char*)sqlite3_value_text(argv[0]); + struct ipaddress* ip1 = parse_ipaddress(address1); + const char* address2 = (char*)sqlite3_value_text(argv[1]); + struct ipaddress* ip2 = parse_ipaddress(address2); + if (ip1 == NULL || ip2 == NULL) { + sqlite3_result_null(context); + goto end; + } + if (ip1->af != ip2->af || ip1->masklen > ip2->masklen) { + sqlite3_result_int(context, 0); + goto end; + } + + if (ip1->af == AF_INET) { + ip1->ipv4.s_addr = + htonl(ntohl(ip1->ipv4.s_addr) & ~(uint32_t)((1ULL << (32 - ip1->masklen)) - 1)); + ip2->ipv4.s_addr = + htonl(ntohl(ip2->ipv4.s_addr) & ~(uint32_t)((1ULL << (32 - ip1->masklen)) - 1)); + sqlite3_result_int(context, ip1->ipv4.s_addr == ip2->ipv4.s_addr); + goto end; + } + if (ip1->af == AF_INET6) { + for (unsigned i = 0; i < 16; i++) { + if (ip1->masklen / 8 < i) { + ip1->ipv6.s6_addr[i] = 0; + ip2->ipv6.s6_addr[i] = 0; + } else if (ip1->masklen / 8 == i) { + ip1->ipv6.s6_addr[i] &= ~(ip1->masklen % 8); + ip2->ipv6.s6_addr[i] &= ~(ip1->masklen % 8); + } + if (ip1->ipv6.s6_addr[i] != ip2->ipv6.s6_addr[i]) { + sqlite3_result_int(context, 0); + goto end; + } + } + sqlite3_result_int(context, 1); + } +end: + 
sqlite3_free(ip1); + sqlite3_free(ip2); +} + +int ipaddr_init(sqlite3* db) { + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "ipfamily", 1, flags, 0, ipaddr_ipfamily, 0, 0); + sqlite3_create_function(db, "iphost", 1, flags, 0, ipaddr_iphost, 0, 0); + sqlite3_create_function(db, "ipmasklen", 1, flags, 0, ipaddr_ipmasklen, 0, 0); + sqlite3_create_function(db, "ipnetwork", 1, flags, 0, ipaddr_ipnetwork, 0, 0); + sqlite3_create_function(db, "ipcontains", 2, flags, 0, ipaddr_ipcontains, 0, 0); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/ipaddr/extension.h b/libsql-ffi/bundled/sqlean/ipaddr/extension.h new file mode 100644 index 0000000000..65a51437e5 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/ipaddr/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2021 Vincent Bernat, MIT License +// https://github.com/nalgeon/sqlean + +// IP address manipulation in SQLite. + +#ifndef IPADDR_EXTENSION_H +#define IPADDR_EXTENSION_H + +#include "sqlite3ext.h" + +int ipaddr_init(sqlite3* db); + +#endif /* IPADDR_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/math/extension.c b/libsql-ffi/bundled/sqlean/math/extension.c new file mode 100644 index 0000000000..d48d65a728 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/math/extension.c @@ -0,0 +1,309 @@ +// Originally from SQLite 3.42.0 source code (func.c), Public Domain +// Updated as of 3.46.0 + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// SQLite math functions. 
+ +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#if defined(HAVE_STDINT_H) /* Use this case if we have ANSI headers */ +#define SQLITE_PTR_TO_INT(X) ((int)(intptr_t)(X)) +#elif defined(__PTRDIFF_TYPE__) /* This case should work for GCC */ +#define SQLITE_PTR_TO_INT(X) ((int)(__PTRDIFF_TYPE__)(X)) +#elif !defined(__GNUC__) /* Works for compilers other than LLVM */ +#define SQLITE_PTR_TO_INT(X) ((int)(((char*)X) - (char*)0)) +#else /* Generates a warning - but it always works */ +#define SQLITE_PTR_TO_INT(X) ((int)(X)) +#endif + +/* Mathematical Constants */ +#ifndef M_PI +#define M_PI 3.141592653589793238462643383279502884 +#endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684364208 +#endif +#ifndef M_LN2 +#define M_LN2 0.693147180559945309417232121458176568 +#endif + +/* +** Implementation SQL functions: +** +** ceil(X) +** ceiling(X) +** floor(X) +** +** The sqlite3_user_data() pointer is a pointer to the libm implementation +** of the underlying C function. +*/ +static void ceilingFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + switch (sqlite3_value_numeric_type(argv[0])) { + case SQLITE_INTEGER: { + sqlite3_result_int64(context, sqlite3_value_int64(argv[0])); + break; + } + case SQLITE_FLOAT: { + double (*x)(double) = (double (*)(double))sqlite3_user_data(context); + sqlite3_result_double(context, x(sqlite3_value_double(argv[0]))); + break; + } + default: { + break; + } + } +} + +/* +** On some systems, ceil() and floor() are intrinsic function. You are +** unable to take a pointer to these functions. Hence, we here wrap them +** in our own actual functions. +*/ +static double xCeil(double x) { + return ceil(x); +} +static double xFloor(double x) { + return floor(x); +} + +/* +** Some systems do not have log2() and log10() in their standard math +** libraries. 
+*/ +#if defined(HAVE_LOG10) && HAVE_LOG10 == 0 +#define log10(X) (0.4342944819032517867 * log(X)) +#endif +#if defined(HAVE_LOG2) && HAVE_LOG2 == 0 +#define log2(X) (1.442695040888963456 * log(X)) +#endif + +/* +** Implementation of SQL functions: +** +** ln(X) - natural logarithm +** log(X) - log X base 10 +** log10(X) - log X base 10 +** log(B,X) - log X base B +*/ +static void logFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + double x, b, ans; + assert(argc == 1 || argc == 2); + switch (sqlite3_value_numeric_type(argv[0])) { + case SQLITE_INTEGER: + case SQLITE_FLOAT: + x = sqlite3_value_double(argv[0]); + if (x <= 0.0) + return; + break; + default: + return; + } + if (argc == 2) { + switch (sqlite3_value_numeric_type(argv[0])) { + case SQLITE_INTEGER: + case SQLITE_FLOAT: + b = log(x); + if (b <= 0.0) + return; + x = sqlite3_value_double(argv[1]); + if (x <= 0.0) + return; + break; + default: + return; + } + ans = log(x) / b; + } else { + switch (SQLITE_PTR_TO_INT(sqlite3_user_data(context))) { + case 1: + ans = log10(x); + break; + case 2: + ans = log2(x); + break; + default: + ans = log(x); + break; + } + } + sqlite3_result_double(context, ans); +} + +/* +** Functions to converts degrees to radians and radians to degrees. 
+*/ +static double degToRad(double x) { + return x * (M_PI / 180.0); +} +static double radToDeg(double x) { + return x * (180.0 / M_PI); +} + +/* +** Implementation of 1-argument SQL math functions: +** +** exp(X) - Compute e to the X-th power +*/ +static void math1Func(sqlite3_context* context, int argc, sqlite3_value** argv) { + int type0; + double v0, ans; + double (*x)(double); + assert(argc == 1); + type0 = sqlite3_value_numeric_type(argv[0]); + if (type0 != SQLITE_INTEGER && type0 != SQLITE_FLOAT) + return; + v0 = sqlite3_value_double(argv[0]); + x = (double (*)(double))sqlite3_user_data(context); + ans = x(v0); + sqlite3_result_double(context, ans); +} + +/* +** Implementation of 2-argument SQL math functions: +** +** power(X,Y) - Compute X to the Y-th power +*/ +static void math2Func(sqlite3_context* context, int argc, sqlite3_value** argv) { + int type0, type1; + double v0, v1, ans; + double (*x)(double, double); + assert(argc == 2); + type0 = sqlite3_value_numeric_type(argv[0]); + if (type0 != SQLITE_INTEGER && type0 != SQLITE_FLOAT) + return; + type1 = sqlite3_value_numeric_type(argv[1]); + if (type1 != SQLITE_INTEGER && type1 != SQLITE_FLOAT) + return; + v0 = sqlite3_value_double(argv[0]); + v1 = sqlite3_value_double(argv[1]); + x = (double (*)(double, double))sqlite3_user_data(context); + ans = x(v0, v1); + sqlite3_result_double(context, ans); +} + +/* +** Implementation of 0-argument pi() function. 
+*/ +static void piFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 0); + (void)argv; + sqlite3_result_double(context, M_PI); +} + +/* +** Implementation of the round() function +*/ +static void roundFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + int n = 0; + double r; + char* zBuf; + assert(argc == 1 || argc == 2); + if (argc == 2) { + if (SQLITE_NULL == sqlite3_value_type(argv[1])) + return; + n = sqlite3_value_int(argv[1]); + if (n > 30) + n = 30; + if (n < 0) + n = 0; + } + if (sqlite3_value_type(argv[0]) == SQLITE_NULL) + return; + r = sqlite3_value_double(argv[0]); + /* If Y==0 and X will fit in a 64-bit int, + ** handle the rounding directly, + ** otherwise use printf. + */ + if (r < -4503599627370496.0 || r > +4503599627370496.0) { + /* The value has no fractional part so there is nothing to round */ + } else if (n == 0) { + r = (double)((sqlite_int64)(r + (r < 0 ? -0.5 : +0.5))); + } else { + zBuf = sqlite3_mprintf("%!.*f", n, r); + if (zBuf == 0) { + sqlite3_result_error_nomem(context); + return; + } + // sqlite3AtoF(zBuf, &r, sqlite3Strlen30(zBuf), SQLITE_UTF8); + r = strtod(zBuf, NULL); + sqlite3_free(zBuf); + } + sqlite3_result_double(context, r); +} + +int math_init(sqlite3* db) { + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + + sqlite3_create_function(db, "math_round", 1, flags, 0, roundFunc, 0, 0); + sqlite3_create_function(db, "math_round", 2, flags, 0, roundFunc, 0, 0); + sqlite3_create_function(db, "math_ceil", 1, flags, xCeil, ceilingFunc, 0, 0); + sqlite3_create_function(db, "math_floor", 1, flags, xFloor, ceilingFunc, 0, 0); + sqlite3_create_function(db, "math_trunc", 1, flags, trunc, ceilingFunc, 0, 0); + sqlite3_create_function(db, "math_ln", 1, flags, 0, logFunc, 0, 0); + sqlite3_create_function(db, "math_log", 1, flags, (void*)(1), logFunc, 0, 0); + sqlite3_create_function(db, "math_log10", 1, flags, (void*)(1), logFunc, 0, 0); + 
sqlite3_create_function(db, "math_log2", 1, flags, (void*)(2), logFunc, 0, 0); + sqlite3_create_function(db, "math_log", 2, flags, 0, logFunc, 0, 0); + sqlite3_create_function(db, "math_exp", 1, flags, exp, math1Func, 0, 0); + sqlite3_create_function(db, "math_pow", 2, flags, pow, math2Func, 0, 0); + sqlite3_create_function(db, "math_mod", 2, flags, fmod, math2Func, 0, 0); + sqlite3_create_function(db, "math_acos", 1, flags, acos, math1Func, 0, 0); + sqlite3_create_function(db, "math_asin", 1, flags, asin, math1Func, 0, 0); + sqlite3_create_function(db, "math_atan", 1, flags, atan, math1Func, 0, 0); + sqlite3_create_function(db, "math_atan2", 2, flags, atan2, math2Func, 0, 0); + sqlite3_create_function(db, "math_cos", 1, flags, cos, math1Func, 0, 0); + sqlite3_create_function(db, "math_sin", 1, flags, sin, math1Func, 0, 0); + sqlite3_create_function(db, "math_tan", 1, flags, tan, math1Func, 0, 0); + sqlite3_create_function(db, "math_cosh", 1, flags, cosh, math1Func, 0, 0); + sqlite3_create_function(db, "math_sinh", 1, flags, sinh, math1Func, 0, 0); + sqlite3_create_function(db, "math_tanh", 1, flags, tanh, math1Func, 0, 0); + sqlite3_create_function(db, "math_acosh", 1, flags, acosh, math1Func, 0, 0); + sqlite3_create_function(db, "math_asinh", 1, flags, asinh, math1Func, 0, 0); + sqlite3_create_function(db, "math_atanh", 1, flags, atanh, math1Func, 0, 0); + sqlite3_create_function(db, "math_sqrt", 1, flags, sqrt, math1Func, 0, 0); + sqlite3_create_function(db, "math_radians", 1, flags, degToRad, math1Func, 0, 0); + sqlite3_create_function(db, "math_degrees", 1, flags, radToDeg, math1Func, 0, 0); + sqlite3_create_function(db, "math_pi", 0, flags, 0, piFunc, 0, 0); + + sqlite3_create_function(db, "ceil", 1, flags, xCeil, ceilingFunc, 0, 0); + sqlite3_create_function(db, "ceiling", 1, flags, xCeil, ceilingFunc, 0, 0); + sqlite3_create_function(db, "floor", 1, flags, xFloor, ceilingFunc, 0, 0); + sqlite3_create_function(db, "trunc", 1, flags, trunc, ceilingFunc, 0, 
0); + sqlite3_create_function(db, "ln", 1, flags, 0, logFunc, 0, 0); + sqlite3_create_function(db, "log", 1, flags, (void*)(1), logFunc, 0, 0); + sqlite3_create_function(db, "log10", 1, flags, (void*)(1), logFunc, 0, 0); + sqlite3_create_function(db, "log2", 1, flags, (void*)(2), logFunc, 0, 0); + sqlite3_create_function(db, "log", 2, flags, 0, logFunc, 0, 0); + sqlite3_create_function(db, "exp", 1, flags, exp, math1Func, 0, 0); + sqlite3_create_function(db, "pow", 2, flags, pow, math2Func, 0, 0); + sqlite3_create_function(db, "power", 2, flags, pow, math2Func, 0, 0); + sqlite3_create_function(db, "mod", 2, flags, fmod, math2Func, 0, 0); + sqlite3_create_function(db, "acos", 1, flags, acos, math1Func, 0, 0); + sqlite3_create_function(db, "asin", 1, flags, asin, math1Func, 0, 0); + sqlite3_create_function(db, "atan", 1, flags, atan, math1Func, 0, 0); + sqlite3_create_function(db, "atan2", 2, flags, atan2, math2Func, 0, 0); + sqlite3_create_function(db, "cos", 1, flags, cos, math1Func, 0, 0); + sqlite3_create_function(db, "sin", 1, flags, sin, math1Func, 0, 0); + sqlite3_create_function(db, "tan", 1, flags, tan, math1Func, 0, 0); + sqlite3_create_function(db, "cosh", 1, flags, cosh, math1Func, 0, 0); + sqlite3_create_function(db, "sinh", 1, flags, sinh, math1Func, 0, 0); + sqlite3_create_function(db, "tanh", 1, flags, tanh, math1Func, 0, 0); + sqlite3_create_function(db, "acosh", 1, flags, acosh, math1Func, 0, 0); + sqlite3_create_function(db, "asinh", 1, flags, asinh, math1Func, 0, 0); + sqlite3_create_function(db, "atanh", 1, flags, atanh, math1Func, 0, 0); + sqlite3_create_function(db, "sqrt", 1, flags, sqrt, math1Func, 0, 0); + sqlite3_create_function(db, "radians", 1, flags, degToRad, math1Func, 0, 0); + sqlite3_create_function(db, "degrees", 1, flags, radToDeg, math1Func, 0, 0); + sqlite3_create_function(db, "pi", 0, flags, 0, piFunc, 0, 0); + + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/math/extension.h 
b/libsql-ffi/bundled/sqlean/math/extension.h new file mode 100644 index 0000000000..29ff9353d9 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/math/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite math functions. + +#ifndef MATH_EXTENSION_H +#define MATH_EXTENSION_H + +#include "sqlite3ext.h" + +int math_init(sqlite3* db); + +#endif /* MATH_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/regexp/constants.h b/libsql-ffi/bundled/sqlean/regexp/constants.h new file mode 100644 index 0000000000..b2dc91559c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/constants.h @@ -0,0 +1,18 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// PCRE2 build constants. + +#ifndef REGEXP_CONSTANTS_H +#define REGEXP_CONSTANTS_H + +#define PCRE2_CODE_UNIT_WIDTH 8 +#define LINK_SIZE 2 +#define HAVE_CONFIG_H +#define SUPPORT_UNICODE + +#if defined(_WIN32) +#define PCRE2_STATIC +#endif + +#endif /* REGEXP_CONSTANTS_H */ diff --git a/libsql-ffi/bundled/sqlean/regexp/extension.c b/libsql-ffi/bundled/sqlean/regexp/extension.c new file mode 100644 index 0000000000..133d8f1004 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/extension.c @@ -0,0 +1,346 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with regular expressions. 
+ +/* + * regexp_like(source, pattern) + * - checks if the source string matches the pattern + * regexp_substr(source, pattern) + * - returns a substring of the source string that matches the pattern + * regexp_replace(source, pattern, replacement) + * - replaces all matching substrings with the replacement string + * + * Supports PCRE syntax, see docs/regexp.md + * + */ +#include +#include +#include +#include +#include + +#include "regexp/pcre2/pcre2.h" +#include "regexp/regexp.h" + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +/* + * Checks if the source string matches the pattern. + * regexp_statement(pattern, source) + * E.g.: + * select true where 'abc' regexp 'a.c'; + */ +static void fn_statement(sqlite3_context* context, int argc, sqlite3_value** argv) { + const char* source; + const char* pattern; + int is_match = 0; + + assert(argc == 2); + + source = (const char*)sqlite3_value_text(argv[1]); + if (!source) { + sqlite3_result_int(context, is_match); + return; + } + + pattern = (const char*)sqlite3_value_text(argv[0]); + if (!pattern) { + sqlite3_result_error(context, "missing regexp pattern", -1); + return; + } + + bool is_new_re = false; + pcre2_code* re = sqlite3_get_auxdata(context, 0); + if (re == NULL) { + re = regexp_compile(pattern); + if (re == NULL) { + char* msg = regexp_get_error(pattern); + sqlite3_result_error(context, msg, -1); + free(msg); + return; + } + is_new_re = true; + } + + int rc = regexp_like(re, source); + if (rc == -1) { + if (is_new_re) { + regexp_free(re); + } + sqlite3_result_error(context, "invalid regexp pattern", -1); + return; + } + + is_match = rc; + sqlite3_result_int(context, is_match); + + if (is_new_re) { + sqlite3_set_auxdata(context, 0, re, (void (*)(void*))regexp_free); + } +} + +/* + * Checks if the source string matches the pattern. 
+ * regexp_like(source, pattern)
+ * E.g.:
+ *   select regexp_like('abc', 'a.c');
+ *
+ * Function-call form with (source, pattern) argument order. A NULL source
+ * yields 0, a NULL pattern is an error, and a pattern that fails to compile
+ * reports the text from regexp_get_error(). The compiled pattern is kept as
+ * auxiliary data on argument 1.
+ */
+static void fn_like(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    const char* text = (const char*)sqlite3_value_text(argv[0]);
+    if (text == NULL) {
+        /* Nothing to match against: answer 0. */
+        sqlite3_result_int(context, 0);
+        return;
+    }
+
+    const char* expr = (const char*)sqlite3_value_text(argv[1]);
+    if (expr == NULL) {
+        sqlite3_result_error(context, "missing regexp pattern", -1);
+        return;
+    }
+
+    /* Prefer the pattern stored by a previous call; compile only on a miss. */
+    pcre2_code* compiled = sqlite3_get_auxdata(context, 1);
+    const bool compiled_here = (compiled == NULL);
+    if (compiled_here) {
+        compiled = regexp_compile(expr);
+        if (compiled == NULL) {
+            char* reason = regexp_get_error(expr);
+            sqlite3_result_error(context, reason, -1);
+            free(reason);
+            return;
+        }
+    }
+
+    const int outcome = regexp_like(compiled, text);
+    if (outcome == -1) {
+        /* A pattern obtained from auxdata is not ours to free. */
+        if (compiled_here) {
+            regexp_free(compiled);
+        }
+        sqlite3_result_error(context, "invalid regexp pattern", -1);
+        return;
+    }
+
+    sqlite3_result_int(context, outcome);
+
+    /* Store the fresh pattern as the final step; SQLite frees it from here on. */
+    if (compiled_here) {
+        sqlite3_set_auxdata(context, 1, compiled, (void (*)(void*))regexp_free);
+    }
+}
+
+/*
+ * Returns a substring of the source string that matches the pattern.
+ * regexp_substr(source, pattern)
+ * E.g.: select regexp_substr('abcdef', 'b.d') = 'bcd';
+ *
+ * Results:
+ *   - source is NULL              -> no result is set (SQL NULL)
+ *   - pattern is NULL             -> error "missing regexp pattern"
+ *   - pattern does not compile    -> error text from regexp_get_error()
+ *   - regexp_extract() returns -1 -> error "invalid regexp pattern"
+ *   - regexp_extract() returns 0  -> no result is set (SQL NULL)
+ *   - otherwise                   -> the extracted text
+ *
+ * The compiled pattern is kept as auxiliary data on argument 1. It is kept
+ * even when nothing matched, so a scan over many non-matching rows does not
+ * recompile the same pattern for each row.
+ */
+static void fn_substr(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const char* source;
+    const char* pattern;
+
+    assert(argc == 2);
+
+    source = (const char*)sqlite3_value_text(argv[0]);
+    if (!source) {
+        return;
+    }
+
+    pattern = (const char*)sqlite3_value_text(argv[1]);
+    if (!pattern) {
+        sqlite3_result_error(context, "missing regexp pattern", -1);
+        return;
+    }
+
+    /* Reuse a pattern compiled by an earlier call, if SQLite still holds one. */
+    bool is_new_re = false;
+    pcre2_code* re = sqlite3_get_auxdata(context, 1);
+    if (re == NULL) {
+        re = regexp_compile(pattern);
+        if (re == NULL) {
+            char* msg = regexp_get_error(pattern);
+            sqlite3_result_error(context, msg, -1);
+            free(msg);
+            return;
+        }
+        is_new_re = true;
+    }
+
+    /* Group 0 is requested here, i.e. the whole matched substring. */
+    char* matched_str = NULL;
+    int rc = regexp_extract(re, source, 0, &matched_str);
+    if (rc == -1) {
+        /* Free only what this call compiled; a cached pattern belongs to SQLite. */
+        if (is_new_re) {
+            regexp_free(re);
+        }
+        sqlite3_result_error(context, "invalid regexp pattern", -1);
+        return;
+    }
+
+    if (rc != 0) {
+        /* SQLITE_TRANSIENT makes SQLite copy the text, so it can be freed here. */
+        sqlite3_result_text(context, matched_str, -1, SQLITE_TRANSIENT);
+        free(matched_str);
+    }
+
+    /* Cache the pattern on match and on no-match alike. This must stay the
+     * last step: `re` may not be used after sqlite3_set_auxdata(). */
+    if (is_new_re) {
+        sqlite3_set_auxdata(context, 1, re, (void (*)(void*))regexp_free);
+    }
+}
+
+/*
+ * Finds a substring of the source string that matches the pattern
+ * and returns the nth matching group within that substring.
+ * regexp_capture(source, pattern[, n])
+ * E.g.: select regexp_capture('abcdef', 'b(.)d', 1) = 'c';
+ *
+ * The group number n defaults to 0 when omitted. Results:
+ *   - source is NULL              -> no result is set (SQL NULL)
+ *   - pattern is NULL             -> error "missing regexp pattern"
+ *   - n is not an SQL integer     -> error "group number should be integer"
+ *   - pattern does not compile    -> error text from regexp_get_error()
+ *   - n is negative or too large
+ *     to be held in a size_t      -> no result is set (SQL NULL)
+ *   - regexp_extract() returns -1 -> error "invalid regexp pattern"
+ *   - regexp_extract() returns 0  -> no result is set (SQL NULL)
+ *   - otherwise                   -> the extracted text
+ *
+ * The compiled pattern is kept as auxiliary data on argument 1, including
+ * when nothing matched, so non-matching rows do not force a recompile.
+ */
+static void fn_capture(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const char* source;
+    const char* pattern;
+
+    assert(argc == 2 || argc == 3);
+
+    source = (const char*)sqlite3_value_text(argv[0]);
+    if (!source) {
+        return;
+    }
+
+    pattern = (const char*)sqlite3_value_text(argv[1]);
+    if (!pattern) {
+        sqlite3_result_error(context, "missing regexp pattern", -1);
+        return;
+    }
+
+    size_t group_idx = 0;
+    bool group_in_range = true;
+    if (argc == 3) {
+        if (sqlite3_value_type(argv[2]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "group number should be integer", -1);
+            return;
+        }
+        /* Validate before narrowing: a negative value would wrap to a huge
+         * size_t, and where size_t is narrower than 64 bits a large value
+         * would be truncated and could alias a real group number. */
+        sqlite3_int64 requested = sqlite3_value_int64(argv[2]);
+        group_in_range = requested >= 0 && (sqlite3_int64)(size_t)requested == requested;
+        if (group_in_range) {
+            group_idx = (size_t)requested;
+        }
+    }
+
+    /* Reuse a pattern compiled by an earlier call, if SQLite still holds one. */
+    bool is_new_re = false;
+    pcre2_code* re = sqlite3_get_auxdata(context, 1);
+    if (re == NULL) {
+        re = regexp_compile(pattern);
+        if (re == NULL) {
+            char* msg = regexp_get_error(pattern);
+            sqlite3_result_error(context, msg, -1);
+            free(msg);
+            return;
+        }
+        is_new_re = true;
+    }
+
+    /* An unrepresentable group number cannot name a group: treat it as "no match". */
+    char* matched_str = NULL;
+    int rc = group_in_range ? regexp_extract(re, source, group_idx, &matched_str) : 0;
+    if (rc == -1) {
+        /* Free only what this call compiled; a cached pattern belongs to SQLite. */
+        if (is_new_re) {
+            regexp_free(re);
+        }
+        sqlite3_result_error(context, "invalid regexp pattern", -1);
+        return;
+    }
+
+    if (rc != 0) {
+        /* SQLITE_TRANSIENT makes SQLite copy the text, so it can be freed here. */
+        sqlite3_result_text(context, matched_str, -1, SQLITE_TRANSIENT);
+        free(matched_str);
+    }
+
+    /* Cache the pattern on match and on no-match alike. This must stay the
+     * last step: `re` may not be used after sqlite3_set_auxdata(). */
+    if (is_new_re) {
+        sqlite3_set_auxdata(context, 1, re, (void (*)(void*))regexp_free);
+    }
+}
+
+/*
+ * Replaces all matching substrings with the replacement string.
+ * regexp_replace(source, pattern, replacement)
+ * E.g.: select regexp_replace('abcdef', 'b.d', '...') = 'a...ef';
+ *
+ * Results:
+ *   - source is NULL              -> no result is set (SQL NULL)
+ *   - pattern is NULL             -> error "missing regexp pattern"
+ *   - replacement is NULL         -> the source value, unchanged
+ *   - pattern does not compile    -> error text from regexp_get_error()
+ *   - regexp_replace() returns -1 -> error "invalid regexp pattern"
+ *   - regexp_replace() returns 0  -> the source value, unchanged
+ *   - otherwise                   -> the text produced by regexp_replace()
+ *
+ * The compiled pattern is kept as auxiliary data on argument 1, including
+ * when the source was returned unchanged, so such rows do not force a
+ * recompile.
+ */
+static void fn_replace(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    const char* source;
+    const char* pattern;
+    const char* replacement;
+    char* result = NULL;
+
+    assert(argc == 3);
+
+    source = (const char*)sqlite3_value_text(argv[0]);
+    if (!source) {
+        return;
+    }
+
+    pattern = (const char*)sqlite3_value_text(argv[1]);
+    if (!pattern) {
+        sqlite3_result_error(context, "missing regexp pattern", -1);
+        return;
+    }
+
+    replacement = (const char*)sqlite3_value_text(argv[2]);
+    if (!replacement) {
+        sqlite3_result_value(context, argv[0]);
+        return;
+    }
+
+    /* Reuse a pattern compiled by an earlier call, if SQLite still holds one. */
+    bool is_new_re = false;
+    pcre2_code* re = sqlite3_get_auxdata(context, 1);
+    if (re == NULL) {
+        re = regexp_compile(pattern);
+        if (re == NULL) {
+            char* msg = regexp_get_error(pattern);
+            sqlite3_result_error(context, msg, -1);
+            free(msg);
+            return;
+        }
+        is_new_re = true;
+    }
+
+    int rc = regexp_replace(re, source, replacement, &result);
+    if (rc == -1) {
+        /* Free only what this call compiled; a cached pattern belongs to SQLite. */
+        if (is_new_re) {
+            regexp_free(re);
+        }
+        sqlite3_result_error(context, "invalid regexp pattern", -1);
+        return;
+    }
+
+    if (rc == 0) {
+        sqlite3_result_value(context, argv[0]);
+    } else {
+        /* SQLITE_TRANSIENT makes SQLite copy the text, so it can be freed here. */
+        sqlite3_result_text(context, result, -1, SQLITE_TRANSIENT);
+        free(result);
+    }
+
+    /* Cache the pattern whether or not anything was replaced. This must stay
+     * the last step: `re` may not be used after sqlite3_set_auxdata(). */
+    if (is_new_re) {
+        sqlite3_set_auxdata(context, 1, re, (void (*)(void*))regexp_free);
+    }
+}
+
+/*
+ * Registers the regexp SQL functions on the given connection.
+ * Returns SQLITE_OK when every function was registered, otherwise the
+ * result code of the first sqlite3_create_function() call that failed.
+ */
+int regexp_init(sqlite3* db) {
+    static const int flags = SQLITE_UTF8 | SQLITE_DETERMINISTIC;
+    static const struct {
+        const char* name;
+        int n_args;
+        void (*impl)(sqlite3_context*, int, sqlite3_value**);
+    } functions[] = {
+        {"regexp", 2, fn_statement},
+        {"regexp_like", 2, fn_like},
+        {"regexp_substr", 2, fn_substr},
+        {"regexp_capture", 2, fn_capture},
+        {"regexp_capture", 3, fn_capture},
+        {"regexp_replace", 3, fn_replace},
+    };
+
+    for (size_t i = 0; i < sizeof(functions) / sizeof(functions[0]); i++) {
+        int rc = sqlite3_create_function(db, functions[i].name, functions[i].n_args, flags, 0,
+                                         functions[i].impl, 0, 0);
+        if (rc != SQLITE_OK) {
+            /* Report the failure instead of claiming a complete registration. */
+            return rc;
+        }
+    }
+    return SQLITE_OK;
+}
diff --git a/libsql-ffi/bundled/sqlean/regexp/extension.h b/libsql-ffi/bundled/sqlean/regexp/extension.h new file mode 100644 index 0000000000..05f70f397e --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with regular expressions. + +#ifndef REGEXP_EXTENSION_H +#define REGEXP_EXTENSION_H + +#include "sqlite3ext.h" + +int regexp_init(sqlite3* db); + +#endif /* REGEXP_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/LICENSE b/libsql-ffi/bundled/sqlean/regexp/pcre2/LICENSE new file mode 100644 index 0000000000..eddd38e3bb --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/LICENSE @@ -0,0 +1,83 @@ +## PCRE2 LICENCE + +PCRE2 is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + +Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD" +licence, as specified below, with one exemption for certain binary +redistributions. The documentation for PCRE2, supplied in the "doc" directory, +is distributed under the same terms as the software itself. The data in the +testdata directory is not copyrighted and is in the public domain. + +The basic library functions are written in C and are freestanding. Also +included in the distribution is a just-in-time compiler that can be used to +optimize pattern matching. This is an optional feature that can be omitted when +the library is built. + +## THE BASIC LIBRARY FUNCTIONS + +Written by: Philip Hazel +Email local part: Philip.Hazel +Email domain: gmail.com + +Retired from University of Cambridge Computing Service, +Cambridge, England. + +Copyright (c) 1997-2022 University of Cambridge +All rights reserved. 
+ +## PCRE2 JUST-IN-TIME COMPILATION SUPPORT + +Written by: Zoltan Herczeg +Email local part: hzmester +Email domain: freemail.hu + +Copyright(c) 2010-2022 Zoltan Herczeg +All rights reserved. + +## STACK-LESS JUST-IN-TIME COMPILER + +Written by: Zoltan Herczeg +Email local part: hzmester +Email domain: freemail.hu + +Copyright(c) 2009-2022 Zoltan Herczeg +All rights reserved. + +## THE "BSD" LICENCE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notices, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +## EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES + +The second condition in the BSD licence (covering binary redistributions) does +not apply all the way down a chain of software. If binary package A includes +PCRE2, it must respect the condition, but if package B is software that +includes package A, the condition is not imposed on package B unless it uses +PCRE2 independently. + +End diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/README.md b/libsql-ffi/bundled/sqlean/regexp/pcre2/README.md new file mode 100644 index 0000000000..f872691ed5 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/README.md @@ -0,0 +1 @@ +Extracted from the [PCRE2-10.42](https://github.com/PCRE2Project/pcre2/releases/tag/pcre2-10.42) release. diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/config.h b/libsql-ffi/bundled/sqlean/regexp/pcre2/config.h new file mode 100644 index 0000000000..5548d18eb2 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/config.h @@ -0,0 +1,457 @@ +/* src/config.h. Generated from config.h.in by configure. */ +/* src/config.h.in. Generated from configure.ac by autoheader. */ + +/* PCRE2 is written in Standard C, but there are a few non-standard things it +can cope with, allowing it to run on SunOS4 and other "close to standard" +systems. + +In environments that support the GNU autotools, config.h.in is converted into +config.h by the "configure" script. In environments that use CMake, +config-cmake.in is converted into config.h. If you are going to build PCRE2 "by +hand" without using "configure" or CMake, you should copy the distributed +config.h.generic to config.h, and edit the macro definitions to be the way you +need them. You must then add -DHAVE_CONFIG_H to all of your compile commands, +so that config.h is included at the start of every source. + +Alternatively, you can avoid editing by using -D on the compiler command line +to set the macro values. 
In this case, you do not have to set -DHAVE_CONFIG_H, +but if you do, default values will be taken from config.h for non-boolean +macros that are not defined on the command line. + +Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be +defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All +such macros are listed as a commented #undef in config.h.generic. Macros such +as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are +surrounded by #ifndef/#endif lines so that the value can be overridden by -D. + +PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if +HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make +sure both macros are undefined; an emulation function will then be used. */ + +/* By default, the \R escape sequence matches any Unicode line ending + character or sequence of characters. If BSR_ANYCRLF is defined (to any + value), this is changed so that backslash-R matches only CR, LF, or CRLF. + The build-time default can be overridden by the user of PCRE2 at runtime. + */ +/* #undef BSR_ANYCRLF */ + +/* Define to any value to disable the use of the z and t modifiers in + formatting settings such as %zu or %td (this is rarely needed). */ +/* #undef DISABLE_PERCENT_ZT */ + +/* If you are compiling for a system that uses EBCDIC instead of ASCII + character codes, define this macro to any value. When EBCDIC is set, PCRE2 + assumes that all input strings are in EBCDIC. If you do not define this + macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It + is not possible to build a version of PCRE2 that supports both EBCDIC and + UTF-8/16/32. */ +/* #undef EBCDIC */ + +/* In an EBCDIC environment, define this macro to any value to arrange for the + NL character to be 0x25 instead of the default 0x15. NL plays the role that + LF does in an ASCII/Unicode environment. 
*/ +/* #undef EBCDIC_NL25 */ + +/* Define this if your compiler supports __attribute__((uninitialized)) */ +/* #undef HAVE_ATTRIBUTE_UNINITIALIZED */ + +/* Define to 1 if you have the `bcopy' function. */ +/* #undef HAVE_BCOPY */ + +/* Define to 1 if you have the <bzlib.h> header file. */ +/* #undef HAVE_BZLIB_H */ + +/* Define to 1 if you have the <dirent.h> header file. */ +/* #undef HAVE_DIRENT_H */ + +/* Define to 1 if you have the <dlfcn.h> header file. */ +/* #undef HAVE_DLFCN_H */ + +/* Define to 1 if you have the <editline/readline.h> header file. */ +/* #undef HAVE_EDITLINE_READLINE_H */ + +/* Define to 1 if you have the <edit/readline/readline.h> header file. */ +/* #undef HAVE_EDIT_READLINE_READLINE_H */ + +/* Define to 1 if you have the <inttypes.h> header file. */ +/* #undef HAVE_INTTYPES_H */ + +/* Define to 1 if you have the <limits.h> header file. */ +/* #undef HAVE_LIMITS_H */ + +/* Define to 1 if you have the `memfd_create' function. */ +/* #undef HAVE_MEMFD_CREATE */ + +/* Define to 1 if you have the `memmove' function. */ +/* #undef HAVE_MEMMOVE */ + +/* Define to 1 if you have the <minix/config.h> header file. */ +/* #undef HAVE_MINIX_CONFIG_H */ + +/* Define to 1 if you have the `mkostemp' function. */ +/* #undef HAVE_MKOSTEMP */ + +/* Define if you have POSIX threads libraries and header files. */ +/* #undef HAVE_PTHREAD */ + +/* Have PTHREAD_PRIO_INHERIT. */ +/* #undef HAVE_PTHREAD_PRIO_INHERIT */ + +/* Define to 1 if you have the <readline.h> header file. */ +/* #undef HAVE_READLINE_H */ + +/* Define to 1 if you have the <readline/history.h> header file. */ +/* #undef HAVE_READLINE_HISTORY_H */ + +/* Define to 1 if you have the <readline/readline.h> header file. */ +/* #undef HAVE_READLINE_READLINE_H */ + +/* Define to 1 if you have the `realpath' function. */ +/* #undef HAVE_REALPATH */ + +/* Define to 1 if you have the `secure_getenv' function. */ +/* #undef HAVE_SECURE_GETENV */ + +/* Define to 1 if you have the <stdint.h> header file. */ +/* #undef HAVE_STDINT_H */ + +/* Define to 1 if you have the <stdio.h> header file. */ +/* #undef HAVE_STDIO_H */ + +/* Define to 1 if you have the <stdlib.h> header file. 
*/ +/* #undef HAVE_STDLIB_H */ + +/* Define to 1 if you have the `strerror' function. */ +/* #undef HAVE_STRERROR */ + +/* Define to 1 if you have the <strings.h> header file. */ +/* #undef HAVE_STRINGS_H */ + +/* Define to 1 if you have the <string.h> header file. */ +/* #undef HAVE_STRING_H */ + +/* Define to 1 if you have the <sys/stat.h> header file. */ +/* #undef HAVE_SYS_STAT_H */ + +/* Define to 1 if you have the <sys/types.h> header file. */ +/* #undef HAVE_SYS_TYPES_H */ + +/* Define to 1 if you have the <sys/wait.h> header file. */ +/* #undef HAVE_SYS_WAIT_H */ + +/* Define to 1 if you have the <unistd.h> header file. */ +/* #undef HAVE_UNISTD_H */ + +/* Define to 1 if the compiler supports simple visibility declarations. */ +/* #undef HAVE_VISIBILITY */ + +/* Define to 1 if you have the <wchar.h> header file. */ +/* #undef HAVE_WCHAR_H */ + +/* Define to 1 if you have the <windows.h> header file. */ +/* #undef HAVE_WINDOWS_H */ + +/* Define to 1 if you have the <zlib.h> header file. */ +/* #undef HAVE_ZLIB_H */ + +/* This limits the amount of memory that may be used while matching a pattern. + It applies to both pcre2_match() and pcre2_dfa_match(). It does not apply + to JIT matching. The value is in kibibytes (units of 1024 bytes). */ +#ifndef HEAP_LIMIT +#define HEAP_LIMIT 20000000 +#endif + +/* The value of LINK_SIZE determines the number of bytes used to store links + as offsets within the compiled regex. The default is 2, which allows for + compiled patterns up to 65535 code units long. This covers the vast + majority of cases. However, PCRE2 can also be compiled to use 3 or 4 bytes + instead. This allows for longer patterns in extreme cases. */ +#ifndef LINK_SIZE +#define LINK_SIZE 2 +#endif + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +/* This is ignored unless you are using libtool. */ +#ifndef LT_OBJDIR +#define LT_OBJDIR ".libs/" +#endif + +/* The value of MATCH_LIMIT determines the default number of times the + pcre2_match() function can record a backtrack position during a single + matching attempt. 
The value is also used to limit a loop counter in + pcre2_dfa_match(). There is a runtime interface for setting a different + limit. The limit exists in order to catch runaway regular expressions that + take for ever to determine that they do not match. The default is set very + large so that it does not accidentally catch legitimate cases. */ +#ifndef MATCH_LIMIT +#define MATCH_LIMIT 10000000 +#endif + +/* The above limit applies to all backtracks, whether or not they are nested. + In some environments it is desirable to limit the nesting of backtracking + (that is, the depth of tree that is searched) more strictly, in order to + restrict the maximum amount of heap memory that is used. The value of + MATCH_LIMIT_DEPTH provides this facility. To have any useful effect, it + must be less than the value of MATCH_LIMIT. The default is to use the same + value as MATCH_LIMIT. There is a runtime method for setting a different + limit. In the case of pcre2_dfa_match(), this limit controls the depth of + the internal nested function calls that are used for pattern recursions, + lookarounds, and atomic groups. */ +#ifndef MATCH_LIMIT_DEPTH +#define MATCH_LIMIT_DEPTH MATCH_LIMIT +#endif + +/* This limit is parameterized just in case anybody ever wants to change it. + Care must be taken if it is increased, because it guards against integer + overflow caused by enormously large patterns. */ +#ifndef MAX_NAME_COUNT +#define MAX_NAME_COUNT 10000 +#endif + +/* This limit is parameterized just in case anybody ever wants to change it. + Care must be taken if it is increased, because it guards against integer + overflow caused by enormously large patterns. */ +#ifndef MAX_NAME_SIZE +#define MAX_NAME_SIZE 32 +#endif + +/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */ +/* #undef NEVER_BACKSLASH_C */ + +/* The value of NEWLINE_DEFAULT determines the default newline character + sequence. 
PCRE2 client programs can override this by selecting other values + at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), 5 + (ANYCRLF), and 6 (NUL). */ +#ifndef NEWLINE_DEFAULT +#define NEWLINE_DEFAULT 2 +#endif + +/* Name of package */ +#define PACKAGE "pcre2" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "PCRE2" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "PCRE2 10.42" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "pcre2" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "10.42" + +/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested + parentheses (of any kind) in a pattern. This limits the amount of system + stack that is used while compiling a pattern. */ +#ifndef PARENS_NEST_LIMIT +#define PARENS_NEST_LIMIT 250 +#endif + +/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing + very long lines. The actual amount of memory used by pcre2grep is three + times this number, because it allows for the buffering of "before" and + "after" lines. */ +#ifndef PCRE2GREP_BUFSIZE +#define PCRE2GREP_BUFSIZE 20480 +#endif + +/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines. */ +#ifndef PCRE2GREP_MAX_BUFSIZE +#define PCRE2GREP_MAX_BUFSIZE 1048576 +#endif + +/* Define to any value to include debugging code. 
*/ +/* #undef PCRE2_DEBUG */ + +/* If you are compiling for a system other than a Unix-like system or + Win32, and it needs some magic to be inserted before the definition + of a function that is exported by the library, define this macro to + contain the relevant magic. If you do not define this macro, a suitable + __declspec value is used for Windows systems; in other environments + "extern" is used for a C compiler and "extern C" for a C++ compiler. + This macro appears at the start of every exported function that is part + of the external API. It does not appear on functions that are "external" + in the C sense, but which are internal to the library. */ +/* #undef PCRE2_EXP_DEFN */ + +/* Define to any value if linking statically (TODO: make nice with Libtool) */ +/* #undef PCRE2_STATIC */ + +/* Define to necessary symbol if this constant uses a non-standard name on + your system. */ +/* #undef PTHREAD_CREATE_JOINABLE */ + +/* Define to any non-zero number to enable support for SELinux compatible + executable memory allocator in JIT. Note that this will have no effect + unless SUPPORT_JIT is also defined. */ +/* #undef SLJIT_PROT_EXECUTABLE_ALLOCATOR */ + +/* Define to 1 if all of the C90 standard headers exist (not just the ones + required in a freestanding environment). This macro is provided for + backward compatibility; new code need not use it. */ +/* #undef STDC_HEADERS */ + +/* Define to any value to enable support for Just-In-Time compiling. */ +/* #undef SUPPORT_JIT */ + +/* Define to any value to allow pcre2grep to be linked with libbz2, so that it + is able to handle .bz2 files. */ +/* #undef SUPPORT_LIBBZ2 */ + +/* Define to any value to allow pcre2test to be linked with libedit. */ +/* #undef SUPPORT_LIBEDIT */ + +/* Define to any value to allow pcre2test to be linked with libreadline. */ +/* #undef SUPPORT_LIBREADLINE */ + +/* Define to any value to allow pcre2grep to be linked with libz, so that it + is able to handle .gz files. 
*/ +/* #undef SUPPORT_LIBZ */ + +/* Define to any value to enable callout script support in pcre2grep. */ +/* #undef SUPPORT_PCRE2GREP_CALLOUT */ + +/* Define to any value to enable fork support in pcre2grep callout scripts. + This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also defined. + */ +/* #undef SUPPORT_PCRE2GREP_CALLOUT_FORK */ + +/* Define to any value to enable JIT support in pcre2grep. Note that this will + have no effect unless SUPPORT_JIT is also defined. */ +/* #undef SUPPORT_PCRE2GREP_JIT */ + +/* Define to any value to enable the 16 bit PCRE2 library. */ +/* #undef SUPPORT_PCRE2_16 */ + +/* Define to any value to enable the 32 bit PCRE2 library. */ +/* #undef SUPPORT_PCRE2_32 */ + +/* Define to any value to enable the 8 bit PCRE2 library. */ +/* #undef SUPPORT_PCRE2_8 */ + +/* Define to any value to enable support for Unicode and UTF encoding. This + will work even in an EBCDIC environment, but it is incompatible with the + EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or* + ASCII/Unicode, but not both at once. */ +/* #undef SUPPORT_UNICODE */ + +/* Define to any value for valgrind support to find invalid memory reads. */ +/* #undef SUPPORT_VALGRIND */ + +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable general extensions on macOS. */ +#ifndef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable X/Open compliant socket functions that do not require linking + with -lxnet on HP-UX 11.11. */ +#ifndef _HPUX_ALT_XOPEN_SOCKET_API +# define _HPUX_ALT_XOPEN_SOCKET_API 1 +#endif +/* Identify the host operating system as Minix. + This macro does not affect the system headers' behavior. + A future release of Autoconf may stop defining this macro. 
*/ +#ifndef _MINIX +/* # undef _MINIX */ +#endif +/* Enable general extensions on NetBSD. + Enable NetBSD compatibility extensions on Minix. */ +#ifndef _NETBSD_SOURCE +# define _NETBSD_SOURCE 1 +#endif +/* Enable OpenBSD compatibility extensions on NetBSD. + Oddly enough, this does nothing on OpenBSD. */ +#ifndef _OPENBSD_SOURCE +# define _OPENBSD_SOURCE 1 +#endif +/* Define to 1 if needed for POSIX-compatible behavior. */ +#ifndef _POSIX_SOURCE +/* # undef _POSIX_SOURCE */ +#endif +/* Define to 2 if needed for POSIX-compatible behavior. */ +#ifndef _POSIX_1_SOURCE +/* # undef _POSIX_1_SOURCE */ +#endif +/* Enable POSIX-compatible threading on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions specified by ISO/IEC TS 18661-5:2014. */ +#ifndef __STDC_WANT_IEC_60559_ATTRIBS_EXT__ +# define __STDC_WANT_IEC_60559_ATTRIBS_EXT__ 1 +#endif +/* Enable extensions specified by ISO/IEC TS 18661-1:2014. */ +#ifndef __STDC_WANT_IEC_60559_BFP_EXT__ +# define __STDC_WANT_IEC_60559_BFP_EXT__ 1 +#endif +/* Enable extensions specified by ISO/IEC TS 18661-2:2015. */ +#ifndef __STDC_WANT_IEC_60559_DFP_EXT__ +# define __STDC_WANT_IEC_60559_DFP_EXT__ 1 +#endif +/* Enable extensions specified by ISO/IEC TS 18661-4:2015. */ +#ifndef __STDC_WANT_IEC_60559_FUNCS_EXT__ +# define __STDC_WANT_IEC_60559_FUNCS_EXT__ 1 +#endif +/* Enable extensions specified by ISO/IEC TS 18661-3:2015. */ +#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__ +# define __STDC_WANT_IEC_60559_TYPES_EXT__ 1 +#endif +/* Enable extensions specified by ISO/IEC TR 24731-2:2010. */ +#ifndef __STDC_WANT_LIB_EXT2__ +# define __STDC_WANT_LIB_EXT2__ 1 +#endif +/* Enable extensions specified by ISO/IEC 24747:2009. */ +#ifndef __STDC_WANT_MATH_SPEC_FUNCS__ +# define __STDC_WANT_MATH_SPEC_FUNCS__ 1 +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable X/Open extensions. 
Define to 500 only if necessary + to make mbstate_t available. */ +#ifndef _XOPEN_SOURCE +/* # undef _XOPEN_SOURCE */ +#endif + +/* Version number of package */ +#define VERSION "10.42" + +/* Number of bits in a file offset, on hosts where this is settable. */ +/* #undef _FILE_OFFSET_BITS */ + +/* Define for large files, on AIX-style hosts. */ +/* #undef _LARGE_FILES */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to the type of a signed integer type of width exactly 64 bits if + such a type exists and the standard includes do not define it. */ +/* #undef int64_t */ + +/* Define to `unsigned int' if <sys/types.h> does not define. */ +/* #undef size_t */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2.h b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2.h new file mode 100644 index 0000000000..67fda24c5e --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2.h @@ -0,0 +1,998 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* This is the public header file for the PCRE library, second API, to be +#included by applications that call PCRE2 functions. + + Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifndef PCRE2_H_IDEMPOTENT_GUARD +#define PCRE2_H_IDEMPOTENT_GUARD + +/* The current PCRE version information. */ + +#define PCRE2_MAJOR 10 +#define PCRE2_MINOR 42 +#define PCRE2_PRERELEASE +#define PCRE2_DATE 2022-12-11 + +/* When an application links to a PCRE DLL in Windows, the symbols that are +imported have to be identified as such. When building PCRE2, the appropriate +export setting is defined in pcre2_internal.h, which includes this file. So we +don't change existing definitions of PCRE2_EXP_DECL. */ + +#if defined(_WIN32) && !defined(PCRE2_STATIC) +# ifndef PCRE2_EXP_DECL +# define PCRE2_EXP_DECL extern __declspec(dllimport) +# endif +#endif + +/* By default, we use the standard "extern" declarations. 
*/ + +#ifndef PCRE2_EXP_DECL +# ifdef __cplusplus +# define PCRE2_EXP_DECL extern "C" +# else +# define PCRE2_EXP_DECL extern +# endif +#endif + +/* When compiling with the MSVC compiler, it is sometimes necessary to include +a "calling convention" before exported function names. (This is secondhand +information; I know nothing about MSVC myself). For example, something like + + void __cdecl function(....) + +might be needed. In order to make this easy, all the exported functions have +PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not +set, we ensure here that it has no effect. */ + +#ifndef PCRE2_CALL_CONVENTION +#define PCRE2_CALL_CONVENTION +#endif + +/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and +uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do +not have stdint.h, which is why we use inttypes.h, which according to the C +standard is a superset of stdint.h. If inttypes.h is not available the build +will break and the relevant values must be provided by some other means. */ + +#include <limits.h> +#include <stdlib.h> +#include <inttypes.h> + +/* Allow for C++ users compiling this directly. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* The following option bits can be passed to pcre2_compile(), pcre2_match(), +or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it +is passed. Put these bits at the most significant end of the options word so +others can be added next to them */ + +#define PCRE2_ANCHORED 0x80000000u +#define PCRE2_NO_UTF_CHECK 0x40000000u +#define PCRE2_ENDANCHORED 0x20000000u + +/* The following option bits can be passed only to pcre2_compile(). However, +they may affect compilation, JIT compilation, and/or interpretive execution. 
+The following tags indicate which: + +C alters what is compiled by pcre2_compile() +J alters what is compiled by pcre2_jit_compile() +M is inspected during pcre2_match() execution +D is inspected during pcre2_dfa_match() execution +*/ + +#define PCRE2_ALLOW_EMPTY_CLASS 0x00000001u /* C */ +#define PCRE2_ALT_BSUX 0x00000002u /* C */ +#define PCRE2_AUTO_CALLOUT 0x00000004u /* C */ +#define PCRE2_CASELESS 0x00000008u /* C */ +#define PCRE2_DOLLAR_ENDONLY 0x00000010u /* J M D */ +#define PCRE2_DOTALL 0x00000020u /* C */ +#define PCRE2_DUPNAMES 0x00000040u /* C */ +#define PCRE2_EXTENDED 0x00000080u /* C */ +#define PCRE2_FIRSTLINE 0x00000100u /* J M D */ +#define PCRE2_MATCH_UNSET_BACKREF 0x00000200u /* C J M */ +#define PCRE2_MULTILINE 0x00000400u /* C */ +#define PCRE2_NEVER_UCP 0x00000800u /* C */ +#define PCRE2_NEVER_UTF 0x00001000u /* C */ +#define PCRE2_NO_AUTO_CAPTURE 0x00002000u /* C */ +#define PCRE2_NO_AUTO_POSSESS 0x00004000u /* C */ +#define PCRE2_NO_DOTSTAR_ANCHOR 0x00008000u /* C */ +#define PCRE2_NO_START_OPTIMIZE 0x00010000u /* J M D */ +#define PCRE2_UCP 0x00020000u /* C J M D */ +#define PCRE2_UNGREEDY 0x00040000u /* C */ +#define PCRE2_UTF 0x00080000u /* C J M D */ +#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ +#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ +#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ +#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ +#define PCRE2_EXTENDED_MORE 0x01000000u /* C */ +#define PCRE2_LITERAL 0x02000000u /* C */ +#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ + +/* An additional compile options word is available in the compile context. 
*/ + +#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ +#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ +#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ +#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ +#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ +#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ +#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */ +#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */ +#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */ +#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */ +#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ +#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ + +/* These are for pcre2_jit_compile(). */ + +#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ +#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u +#define PCRE2_JIT_PARTIAL_HARD 0x00000004u +#define PCRE2_JIT_INVALID_UTF 0x00000100u + +/* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and +pcre2_substitute(). Some are allowed only for one of the functions, and in +these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and +PCRE2_NO_UTF_CHECK can also be passed to these functions (though +pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */ + +#define PCRE2_NOTBOL 0x00000001u +#define PCRE2_NOTEOL 0x00000002u +#define PCRE2_NOTEMPTY 0x00000004u /* ) These two must be kept */ +#define PCRE2_NOTEMPTY_ATSTART 0x00000008u /* ) adjacent to each other. 
*/ +#define PCRE2_PARTIAL_SOFT 0x00000010u +#define PCRE2_PARTIAL_HARD 0x00000020u +#define PCRE2_DFA_RESTART 0x00000040u /* pcre2_dfa_match() only */ +#define PCRE2_DFA_SHORTEST 0x00000080u /* pcre2_dfa_match() only */ +#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */ +#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */ +#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u +#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */ +#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */ + +/* Options for pcre2_pattern_convert(). */ + +#define PCRE2_CONVERT_UTF 0x00000001u +#define PCRE2_CONVERT_NO_UTF_CHECK 0x00000002u +#define PCRE2_CONVERT_POSIX_BASIC 0x00000004u +#define PCRE2_CONVERT_POSIX_EXTENDED 0x00000008u +#define PCRE2_CONVERT_GLOB 0x00000010u +#define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u +#define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u + +/* Newline and \R settings, for use in compile contexts. The newline values +must be kept in step with values set in config.h and both sets must all be +greater than zero. */ + +#define PCRE2_NEWLINE_CR 1 +#define PCRE2_NEWLINE_LF 2 +#define PCRE2_NEWLINE_CRLF 3 +#define PCRE2_NEWLINE_ANY 4 +#define PCRE2_NEWLINE_ANYCRLF 5 +#define PCRE2_NEWLINE_NUL 6 + +#define PCRE2_BSR_UNICODE 1 +#define PCRE2_BSR_ANYCRLF 2 + +/* Error codes for pcre2_compile(). Some of these are also used by +pcre2_pattern_convert(). 
*/ + +#define PCRE2_ERROR_END_BACKSLASH 101 +#define PCRE2_ERROR_END_BACKSLASH_C 102 +#define PCRE2_ERROR_UNKNOWN_ESCAPE 103 +#define PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER 104 +#define PCRE2_ERROR_QUANTIFIER_TOO_BIG 105 +#define PCRE2_ERROR_MISSING_SQUARE_BRACKET 106 +#define PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS 107 +#define PCRE2_ERROR_CLASS_RANGE_ORDER 108 +#define PCRE2_ERROR_QUANTIFIER_INVALID 109 +#define PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT 110 +#define PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY 111 +#define PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS 112 +#define PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING 113 +#define PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS 114 +#define PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE 115 +#define PCRE2_ERROR_NULL_PATTERN 116 +#define PCRE2_ERROR_BAD_OPTIONS 117 +#define PCRE2_ERROR_MISSING_COMMENT_CLOSING 118 +#define PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP 119 +#define PCRE2_ERROR_PATTERN_TOO_LARGE 120 +#define PCRE2_ERROR_HEAP_FAILED 121 +#define PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS 122 +#define PCRE2_ERROR_INTERNAL_CODE_OVERFLOW 123 +#define PCRE2_ERROR_MISSING_CONDITION_CLOSING 124 +#define PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH 125 +#define PCRE2_ERROR_ZERO_RELATIVE_REFERENCE 126 +#define PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES 127 +#define PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED 128 +#define PCRE2_ERROR_BAD_RELATIVE_REFERENCE 129 +#define PCRE2_ERROR_UNKNOWN_POSIX_CLASS 130 +#define PCRE2_ERROR_INTERNAL_STUDY_ERROR 131 +#define PCRE2_ERROR_UNICODE_NOT_SUPPORTED 132 +#define PCRE2_ERROR_PARENTHESES_STACK_CHECK 133 +#define PCRE2_ERROR_CODE_POINT_TOO_BIG 134 +#define PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED 135 +#define PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C 136 +#define PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE 137 +#define PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG 138 +#define PCRE2_ERROR_MISSING_CALLOUT_CLOSING 139 +#define PCRE2_ERROR_ESCAPE_INVALID_IN_VERB 140 +#define PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P 141 +#define 
PCRE2_ERROR_MISSING_NAME_TERMINATOR 142 +#define PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME 143 +#define PCRE2_ERROR_INVALID_SUBPATTERN_NAME 144 +#define PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE 145 +#define PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY 146 +#define PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY 147 +#define PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG 148 +#define PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS 149 +#define PCRE2_ERROR_CLASS_INVALID_RANGE 150 +#define PCRE2_ERROR_OCTAL_BYTE_TOO_BIG 151 +#define PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE 152 +#define PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN 153 +#define PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES 154 +#define PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE 155 +#define PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE 156 +#define PCRE2_ERROR_BACKSLASH_G_SYNTAX 157 +#define PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING 158 +/* Error 159 is obsolete and should now never occur */ +#define PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED 159 +#define PCRE2_ERROR_VERB_UNKNOWN 160 +#define PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG 161 +#define PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED 162 +#define PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW 163 +#define PCRE2_ERROR_INVALID_OCTAL 164 +#define PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH 165 +#define PCRE2_ERROR_MARK_MISSING_ARGUMENT 166 +#define PCRE2_ERROR_INVALID_HEXADECIMAL 167 +#define PCRE2_ERROR_BACKSLASH_C_SYNTAX 168 +#define PCRE2_ERROR_BACKSLASH_K_SYNTAX 169 +#define PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS 170 +#define PCRE2_ERROR_BACKSLASH_N_IN_CLASS 171 +#define PCRE2_ERROR_CALLOUT_STRING_TOO_LONG 172 +#define PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT 173 +#define PCRE2_ERROR_UTF_IS_DISABLED 174 +#define PCRE2_ERROR_UCP_IS_DISABLED 175 +#define PCRE2_ERROR_VERB_NAME_TOO_LONG 176 +#define PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG 177 +#define PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS 178 +#define PCRE2_ERROR_VERSION_CONDITION_SYNTAX 179 +#define PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS 180 +#define 
PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER 181 +#define PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER 182 +#define PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED 183 +#define PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP 184 +#define PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED 185 +#define PCRE2_ERROR_PATTERN_TOO_COMPLICATED 186 +#define PCRE2_ERROR_LOOKBEHIND_TOO_LONG 187 +#define PCRE2_ERROR_PATTERN_STRING_TOO_LONG 188 +#define PCRE2_ERROR_INTERNAL_BAD_CODE 189 +#define PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP 190 +#define PCRE2_ERROR_NO_SURROGATES_IN_UTF16 191 +#define PCRE2_ERROR_BAD_LITERAL_OPTIONS 192 +#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193 +#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194 +#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195 +#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196 +#define PCRE2_ERROR_TOO_MANY_CAPTURES 197 +#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198 +#define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199 + + +/* "Expected" matching error codes: no match and partial match. 
*/ + +#define PCRE2_ERROR_NOMATCH (-1) +#define PCRE2_ERROR_PARTIAL (-2) + +/* Error codes for UTF-8 validity checks */ + +#define PCRE2_ERROR_UTF8_ERR1 (-3) +#define PCRE2_ERROR_UTF8_ERR2 (-4) +#define PCRE2_ERROR_UTF8_ERR3 (-5) +#define PCRE2_ERROR_UTF8_ERR4 (-6) +#define PCRE2_ERROR_UTF8_ERR5 (-7) +#define PCRE2_ERROR_UTF8_ERR6 (-8) +#define PCRE2_ERROR_UTF8_ERR7 (-9) +#define PCRE2_ERROR_UTF8_ERR8 (-10) +#define PCRE2_ERROR_UTF8_ERR9 (-11) +#define PCRE2_ERROR_UTF8_ERR10 (-12) +#define PCRE2_ERROR_UTF8_ERR11 (-13) +#define PCRE2_ERROR_UTF8_ERR12 (-14) +#define PCRE2_ERROR_UTF8_ERR13 (-15) +#define PCRE2_ERROR_UTF8_ERR14 (-16) +#define PCRE2_ERROR_UTF8_ERR15 (-17) +#define PCRE2_ERROR_UTF8_ERR16 (-18) +#define PCRE2_ERROR_UTF8_ERR17 (-19) +#define PCRE2_ERROR_UTF8_ERR18 (-20) +#define PCRE2_ERROR_UTF8_ERR19 (-21) +#define PCRE2_ERROR_UTF8_ERR20 (-22) +#define PCRE2_ERROR_UTF8_ERR21 (-23) + +/* Error codes for UTF-16 validity checks */ + +#define PCRE2_ERROR_UTF16_ERR1 (-24) +#define PCRE2_ERROR_UTF16_ERR2 (-25) +#define PCRE2_ERROR_UTF16_ERR3 (-26) + +/* Error codes for UTF-32 validity checks */ + +#define PCRE2_ERROR_UTF32_ERR1 (-27) +#define PCRE2_ERROR_UTF32_ERR2 (-28) + +/* Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction +functions, context functions, and serializing functions. They are in numerical +order. Originally they were in alphabetical order too, but now that PCRE2 is +released, the numbers must not be changed. 
*/ + +#define PCRE2_ERROR_BADDATA (-29) +#define PCRE2_ERROR_MIXEDTABLES (-30) /* Name was changed */ +#define PCRE2_ERROR_BADMAGIC (-31) +#define PCRE2_ERROR_BADMODE (-32) +#define PCRE2_ERROR_BADOFFSET (-33) +#define PCRE2_ERROR_BADOPTION (-34) +#define PCRE2_ERROR_BADREPLACEMENT (-35) +#define PCRE2_ERROR_BADUTFOFFSET (-36) +#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */ +#define PCRE2_ERROR_DFA_BADRESTART (-38) +#define PCRE2_ERROR_DFA_RECURSE (-39) +#define PCRE2_ERROR_DFA_UCOND (-40) +#define PCRE2_ERROR_DFA_UFUNC (-41) +#define PCRE2_ERROR_DFA_UITEM (-42) +#define PCRE2_ERROR_DFA_WSSIZE (-43) +#define PCRE2_ERROR_INTERNAL (-44) +#define PCRE2_ERROR_JIT_BADOPTION (-45) +#define PCRE2_ERROR_JIT_STACKLIMIT (-46) +#define PCRE2_ERROR_MATCHLIMIT (-47) +#define PCRE2_ERROR_NOMEMORY (-48) +#define PCRE2_ERROR_NOSUBSTRING (-49) +#define PCRE2_ERROR_NOUNIQUESUBSTRING (-50) +#define PCRE2_ERROR_NULL (-51) +#define PCRE2_ERROR_RECURSELOOP (-52) +#define PCRE2_ERROR_DEPTHLIMIT (-53) +#define PCRE2_ERROR_RECURSIONLIMIT (-53) /* Obsolete synonym for PCRE2_ERROR_DEPTHLIMIT */ +#define PCRE2_ERROR_UNAVAILABLE (-54) +#define PCRE2_ERROR_UNSET (-55) +#define PCRE2_ERROR_BADOFFSETLIMIT (-56) +#define PCRE2_ERROR_BADREPESCAPE (-57) +#define PCRE2_ERROR_REPMISSINGBRACE (-58) +#define PCRE2_ERROR_BADSUBSTITUTION (-59) +#define PCRE2_ERROR_BADSUBSPATTERN (-60) +#define PCRE2_ERROR_TOOMANYREPLACE (-61) +#define PCRE2_ERROR_BADSERIALIZEDDATA (-62) +#define PCRE2_ERROR_HEAPLIMIT (-63) +#define PCRE2_ERROR_CONVERT_SYNTAX (-64) +#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65) +#define PCRE2_ERROR_DFA_UINVALID_UTF (-66) + + +/* Request types for pcre2_pattern_info() */ + +#define PCRE2_INFO_ALLOPTIONS 0 +#define PCRE2_INFO_ARGOPTIONS 1 +#define PCRE2_INFO_BACKREFMAX 2 +#define PCRE2_INFO_BSR 3 +#define PCRE2_INFO_CAPTURECOUNT 4 +#define PCRE2_INFO_FIRSTCODEUNIT 5 +#define PCRE2_INFO_FIRSTCODETYPE 6 +#define PCRE2_INFO_FIRSTBITMAP 7 +#define PCRE2_INFO_HASCRORLF 8 +#define PCRE2_INFO_JCHANGED 9
+#define PCRE2_INFO_JITSIZE 10 +#define PCRE2_INFO_LASTCODEUNIT 11 +#define PCRE2_INFO_LASTCODETYPE 12 +#define PCRE2_INFO_MATCHEMPTY 13 +#define PCRE2_INFO_MATCHLIMIT 14 +#define PCRE2_INFO_MAXLOOKBEHIND 15 +#define PCRE2_INFO_MINLENGTH 16 +#define PCRE2_INFO_NAMECOUNT 17 +#define PCRE2_INFO_NAMEENTRYSIZE 18 +#define PCRE2_INFO_NAMETABLE 19 +#define PCRE2_INFO_NEWLINE 20 +#define PCRE2_INFO_DEPTHLIMIT 21 +#define PCRE2_INFO_RECURSIONLIMIT 21 /* Obsolete synonym for PCRE2_INFO_DEPTHLIMIT */ +#define PCRE2_INFO_SIZE 22 +#define PCRE2_INFO_HASBACKSLASHC 23 +#define PCRE2_INFO_FRAMESIZE 24 +#define PCRE2_INFO_HEAPLIMIT 25 +#define PCRE2_INFO_EXTRAOPTIONS 26 + +/* Request types for pcre2_config(). */ + +#define PCRE2_CONFIG_BSR 0 +#define PCRE2_CONFIG_JIT 1 +#define PCRE2_CONFIG_JITTARGET 2 +#define PCRE2_CONFIG_LINKSIZE 3 +#define PCRE2_CONFIG_MATCHLIMIT 4 +#define PCRE2_CONFIG_NEWLINE 5 +#define PCRE2_CONFIG_PARENSLIMIT 6 +#define PCRE2_CONFIG_DEPTHLIMIT 7 +#define PCRE2_CONFIG_RECURSIONLIMIT 7 /* Obsolete synonym for PCRE2_CONFIG_DEPTHLIMIT */ +#define PCRE2_CONFIG_STACKRECURSE 8 /* Obsolete */ +#define PCRE2_CONFIG_UNICODE 9 +#define PCRE2_CONFIG_UNICODE_VERSION 10 +#define PCRE2_CONFIG_VERSION 11 +#define PCRE2_CONFIG_HEAPLIMIT 12 +#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13 +#define PCRE2_CONFIG_COMPILED_WIDTHS 14 +#define PCRE2_CONFIG_TABLES_LENGTH 15 + + +/* Types for code units in patterns and subject strings. */ + +typedef uint8_t PCRE2_UCHAR8; +typedef uint16_t PCRE2_UCHAR16; +typedef uint32_t PCRE2_UCHAR32; + +typedef const PCRE2_UCHAR8 *PCRE2_SPTR8; +typedef const PCRE2_UCHAR16 *PCRE2_SPTR16; +typedef const PCRE2_UCHAR32 *PCRE2_SPTR32; + +/* The PCRE2_SIZE type is used for all string lengths and offsets in PCRE2, +including pattern offsets for errors and subject offsets after a match. We +define special values to indicate zero-terminated strings and unset offsets in +the offset vector (ovector).
*/ + +#define PCRE2_SIZE size_t +#define PCRE2_SIZE_MAX SIZE_MAX +#define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) +#define PCRE2_UNSET (~(PCRE2_SIZE)0) + +/* Generic types for opaque structures and JIT callback functions. These +declarations are defined in a macro that is expanded for each width later. */ + +#define PCRE2_TYPES_LIST \ +struct pcre2_real_general_context; \ +typedef struct pcre2_real_general_context pcre2_general_context; \ +\ +struct pcre2_real_compile_context; \ +typedef struct pcre2_real_compile_context pcre2_compile_context; \ +\ +struct pcre2_real_match_context; \ +typedef struct pcre2_real_match_context pcre2_match_context; \ +\ +struct pcre2_real_convert_context; \ +typedef struct pcre2_real_convert_context pcre2_convert_context; \ +\ +struct pcre2_real_code; \ +typedef struct pcre2_real_code pcre2_code; \ +\ +struct pcre2_real_match_data; \ +typedef struct pcre2_real_match_data pcre2_match_data; \ +\ +struct pcre2_real_jit_stack; \ +typedef struct pcre2_real_jit_stack pcre2_jit_stack; \ +\ +typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *); + + +/* The structures for passing out data via callout functions. We use structures +so that new fields can be added on the end in future versions, without changing +the API of the function, thereby allowing old clients to work without +modification. Define the generic versions in a macro; the width-specific +versions are generated from this macro below. */ + +/* Flags for the callout_flags field. These are cleared after a callout. 
*/ + +#define PCRE2_CALLOUT_STARTMATCH 0x00000001u /* Set for each bumpalong */ +#define PCRE2_CALLOUT_BACKTRACK 0x00000002u /* Set after a backtrack */ + +#define PCRE2_STRUCTURE_LIST \ +typedef struct pcre2_callout_block { \ + uint32_t version; /* Identifies version of block */ \ + /* ------------------------ Version 0 ------------------------------- */ \ + uint32_t callout_number; /* Number compiled into pattern */ \ + uint32_t capture_top; /* Max current capture */ \ + uint32_t capture_last; /* Most recently closed capture */ \ + PCRE2_SIZE *offset_vector; /* The offset vector */ \ + PCRE2_SPTR mark; /* Pointer to current mark or NULL */ \ + PCRE2_SPTR subject; /* The subject being matched */ \ + PCRE2_SIZE subject_length; /* The length of the subject */ \ + PCRE2_SIZE start_match; /* Offset to start of this match attempt */ \ + PCRE2_SIZE current_position; /* Where we currently are in the subject */ \ + PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ + PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ + /* ------------------- Added for Version 1 -------------------------- */ \ + PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ + PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ + PCRE2_SPTR callout_string; /* String compiled into pattern */ \ + /* ------------------- Added for Version 2 -------------------------- */ \ + uint32_t callout_flags; /* See above for list */ \ + /* ------------------------------------------------------------------ */ \ +} pcre2_callout_block; \ +\ +typedef struct pcre2_callout_enumerate_block { \ + uint32_t version; /* Identifies version of block */ \ + /* ------------------------ Version 0 ------------------------------- */ \ + PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ + PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ + uint32_t callout_number; /* Number compiled into pattern */ 
\ + PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ + PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \ + PCRE2_SPTR callout_string; /* String compiled into pattern */ \ + /* ------------------------------------------------------------------ */ \ +} pcre2_callout_enumerate_block; \ +\ +typedef struct pcre2_substitute_callout_block { \ + uint32_t version; /* Identifies version of block */ \ + /* ------------------------ Version 0 ------------------------------- */ \ + PCRE2_SPTR input; /* Pointer to input subject string */ \ + PCRE2_SPTR output; /* Pointer to output buffer */ \ + PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \ + PCRE2_SIZE *ovector; /* Pointer to current ovector */ \ + uint32_t oveccount; /* Count of pairs set in ovector */ \ + uint32_t subscount; /* Substitution number */ \ + /* ------------------------------------------------------------------ */ \ +} pcre2_substitute_callout_block; + + +/* List the generic forms of all other functions in macros, which will be +expanded for each width below. Start with functions that give general +information. */ + +#define PCRE2_GENERAL_INFO_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *); + + +/* Functions for manipulating contexts. 
*/ + +#define PCRE2_GENERAL_CONTEXT_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \ + pcre2_general_context_copy(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \ + pcre2_general_context_create(void *(*)(size_t, void *), \ + void (*)(void *, void *), void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_general_context_free(pcre2_general_context *); + +#define PCRE2_COMPILE_CONTEXT_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \ + pcre2_compile_context_copy(pcre2_compile_context *); \ +PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \ + pcre2_compile_context_create(pcre2_general_context *);\ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_compile_context_free(pcre2_compile_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_character_tables(pcre2_compile_context *, const uint8_t *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_newline(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ + int (*)(uint32_t, void *), void *); + +#define PCRE2_MATCH_CONTEXT_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ + pcre2_match_context_copy(pcre2_match_context *); \ +PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ + pcre2_match_context_create(pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_context_free(pcre2_match_context *); \ 
+PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_callout(pcre2_match_context *, \ + int (*)(pcre2_callout_block *, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_substitute_callout(pcre2_match_context *, \ + int (*)(pcre2_substitute_callout_block *, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_match_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_memory_management(pcre2_match_context *, \ + void *(*)(size_t, void *), void (*)(void *, void *), void *); + +#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \ + pcre2_convert_context_copy(pcre2_convert_context *); \ +PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \ + pcre2_convert_context_create(pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_convert_context_free(pcre2_convert_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_glob_escape(pcre2_convert_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_glob_separator(pcre2_convert_context *, uint32_t); + + +/* Functions concerned with compiling a pattern to PCRE internal code. 
*/ + +#define PCRE2_COMPILE_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \ + pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \ + pcre2_compile_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_code_free(pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \ + pcre2_code_copy(const pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \ + pcre2_code_copy_with_tables(const pcre2_code *); + + +/* Functions that give information about a compiled pattern. */ + +#define PCRE2_PATTERN_INFO_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_callout_enumerate(const pcre2_code *, \ + int (*)(pcre2_callout_enumerate_block *, void *), void *); + + +/* Functions for running a match and inspecting the result. */ + +#define PCRE2_MATCH_FUNCTIONS \ +PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \ + pcre2_match_data_create(uint32_t, pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \ + pcre2_match_data_create_from_pattern(const pcre2_code *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_data_free(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ + pcre2_get_mark(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ + pcre2_get_match_data_size(pcre2_match_data *); \ +PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ + pcre2_get_ovector_count(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE 
*PCRE2_CALL_CONVENTION \ + pcre2_get_ovector_pointer(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ + pcre2_get_startchar(pcre2_match_data *); + + +/* Convenience functions for handling matched substrings. */ + +#define PCRE2_SUBSTRING_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_free(PCRE2_UCHAR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \ + PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_list_free(PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); + +/* Functions for serializing / deserializing compiled patterns. 
*/ + +#define PCRE2_SERIALIZE_FUNCTIONS \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \ + PCRE2_SIZE *, pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_get_number_of_codes(const uint8_t *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_serialize_free(uint8_t *); + + +/* Convenience function for match + substitute. */ + +#define PCRE2_SUBSTITUTE_FUNCTION \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \ + PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *); + + +/* Functions for converting pattern source strings. */ + +#define PCRE2_CONVERT_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_pattern_convert(PCRE2_SPTR, PCRE2_SIZE, uint32_t, PCRE2_UCHAR **, \ + PCRE2_SIZE *, pcre2_convert_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_converted_pattern_free(PCRE2_UCHAR *); + + +/* Functions for JIT processing */ + +#define PCRE2_JIT_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_compile(pcre2_code *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_free_unused_memory(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_create(size_t, size_t, pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_free(pcre2_jit_stack 
*); + + +/* Other miscellaneous functions. */ + +#define PCRE2_OTHER_FUNCTIONS \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ +PCRE2_EXP_DECL const uint8_t *PCRE2_CALL_CONVENTION \ + pcre2_maketables(pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_maketables_free(pcre2_general_context *, const uint8_t *); + +/* Define macros that generate width-specific names from generic versions. The +three-level macro scheme is necessary to get the macros expanded when we want +them to be. First we get the width from PCRE2_LOCAL_WIDTH, which is used for +generating three versions of everything below. After that, PCRE2_SUFFIX will be +re-defined to use PCRE2_CODE_UNIT_WIDTH, for use when macros such as +pcre2_compile are called by application code. */ + +#define PCRE2_JOIN(a,b) a ## b +#define PCRE2_GLUE(a,b) PCRE2_JOIN(a,b) +#define PCRE2_SUFFIX(a) PCRE2_GLUE(a,PCRE2_LOCAL_WIDTH) + + +/* Data types */ + +#define PCRE2_UCHAR PCRE2_SUFFIX(PCRE2_UCHAR) +#define PCRE2_SPTR PCRE2_SUFFIX(PCRE2_SPTR) + +#define pcre2_code PCRE2_SUFFIX(pcre2_code_) +#define pcre2_jit_callback PCRE2_SUFFIX(pcre2_jit_callback_) +#define pcre2_jit_stack PCRE2_SUFFIX(pcre2_jit_stack_) + +#define pcre2_real_code PCRE2_SUFFIX(pcre2_real_code_) +#define pcre2_real_general_context PCRE2_SUFFIX(pcre2_real_general_context_) +#define pcre2_real_compile_context PCRE2_SUFFIX(pcre2_real_compile_context_) +#define pcre2_real_convert_context PCRE2_SUFFIX(pcre2_real_convert_context_) +#define pcre2_real_match_context PCRE2_SUFFIX(pcre2_real_match_context_) +#define pcre2_real_jit_stack PCRE2_SUFFIX(pcre2_real_jit_stack_) +#define pcre2_real_match_data PCRE2_SUFFIX(pcre2_real_match_data_) + + +/* Data blocks */ + +#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_) +#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_) +#define pcre2_substitute_callout_block 
PCRE2_SUFFIX(pcre2_substitute_callout_block_) +#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_) +#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_) +#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_) +#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_) +#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_) + + +/* Functions: the complete list in alphabetical order */ + +#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_) +#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_) +#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_) +#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_) +#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_) +#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_) +#define pcre2_compile_context_create PCRE2_SUFFIX(pcre2_compile_context_create_) +#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_) +#define pcre2_config PCRE2_SUFFIX(pcre2_config_) +#define pcre2_convert_context_copy PCRE2_SUFFIX(pcre2_convert_context_copy_) +#define pcre2_convert_context_create PCRE2_SUFFIX(pcre2_convert_context_create_) +#define pcre2_convert_context_free PCRE2_SUFFIX(pcre2_convert_context_free_) +#define pcre2_converted_pattern_free PCRE2_SUFFIX(pcre2_converted_pattern_free_) +#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_) +#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_) +#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_) +#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_) +#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_) +#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_) +#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_) +#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_) +#define pcre2_get_ovector_count 
PCRE2_SUFFIX(pcre2_get_ovector_count_) +#define pcre2_get_startchar PCRE2_SUFFIX(pcre2_get_startchar_) +#define pcre2_jit_compile PCRE2_SUFFIX(pcre2_jit_compile_) +#define pcre2_jit_match PCRE2_SUFFIX(pcre2_jit_match_) +#define pcre2_jit_free_unused_memory PCRE2_SUFFIX(pcre2_jit_free_unused_memory_) +#define pcre2_jit_stack_assign PCRE2_SUFFIX(pcre2_jit_stack_assign_) +#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_) +#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_) +#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_) +#define pcre2_maketables_free PCRE2_SUFFIX(pcre2_maketables_free_) +#define pcre2_match PCRE2_SUFFIX(pcre2_match_) +#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_) +#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_) +#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_) +#define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_) +#define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_) +#define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_) +#define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_) +#define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_) +#define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_) +#define pcre2_serialize_encode PCRE2_SUFFIX(pcre2_serialize_encode_) +#define pcre2_serialize_free PCRE2_SUFFIX(pcre2_serialize_free_) +#define pcre2_serialize_get_number_of_codes PCRE2_SUFFIX(pcre2_serialize_get_number_of_codes_) +#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_) +#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_) +#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_) +#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_) +#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_) +#define pcre2_set_depth_limit 
PCRE2_SUFFIX(pcre2_set_depth_limit_) +#define pcre2_set_glob_escape PCRE2_SUFFIX(pcre2_set_glob_escape_) +#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_) +#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_) +#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_) +#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_) +#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) +#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) +#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) +#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) +#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) +#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) +#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_) +#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_) +#define pcre2_substring_get_byname PCRE2_SUFFIX(pcre2_substring_get_byname_) +#define pcre2_substring_get_bynumber PCRE2_SUFFIX(pcre2_substring_get_bynumber_) +#define pcre2_substring_length_byname PCRE2_SUFFIX(pcre2_substring_length_byname_) +#define pcre2_substring_length_bynumber PCRE2_SUFFIX(pcre2_substring_length_bynumber_) +#define pcre2_substring_list_get PCRE2_SUFFIX(pcre2_substring_list_get_) +#define pcre2_substring_list_free PCRE2_SUFFIX(pcre2_substring_list_free_) +#define pcre2_substring_nametable_scan PCRE2_SUFFIX(pcre2_substring_nametable_scan_) +#define pcre2_substring_number_from_name PCRE2_SUFFIX(pcre2_substring_number_from_name_) + +/* Keep this old function name for backwards compatibility */ +#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_) + +/* Keep this obsolete function for backwards compatibility: it is now a noop. 
*/ +#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_) + +/* Now generate all three sets of width-specific structures and function +prototypes. */ + +#define PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS \ +PCRE2_TYPES_LIST \ +PCRE2_STRUCTURE_LIST \ +PCRE2_GENERAL_INFO_FUNCTIONS \ +PCRE2_GENERAL_CONTEXT_FUNCTIONS \ +PCRE2_COMPILE_CONTEXT_FUNCTIONS \ +PCRE2_CONVERT_CONTEXT_FUNCTIONS \ +PCRE2_CONVERT_FUNCTIONS \ +PCRE2_MATCH_CONTEXT_FUNCTIONS \ +PCRE2_COMPILE_FUNCTIONS \ +PCRE2_PATTERN_INFO_FUNCTIONS \ +PCRE2_MATCH_FUNCTIONS \ +PCRE2_SUBSTRING_FUNCTIONS \ +PCRE2_SERIALIZE_FUNCTIONS \ +PCRE2_SUBSTITUTE_FUNCTION \ +PCRE2_JIT_FUNCTIONS \ +PCRE2_OTHER_FUNCTIONS + +#define PCRE2_LOCAL_WIDTH 8 +PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS +#undef PCRE2_LOCAL_WIDTH + +#define PCRE2_LOCAL_WIDTH 16 +PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS +#undef PCRE2_LOCAL_WIDTH + +#define PCRE2_LOCAL_WIDTH 32 +PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS +#undef PCRE2_LOCAL_WIDTH + +/* Undefine the list macros; they are no longer needed. */ + +#undef PCRE2_TYPES_LIST +#undef PCRE2_STRUCTURE_LIST +#undef PCRE2_GENERAL_INFO_FUNCTIONS +#undef PCRE2_GENERAL_CONTEXT_FUNCTIONS +#undef PCRE2_COMPILE_CONTEXT_FUNCTIONS +#undef PCRE2_CONVERT_CONTEXT_FUNCTIONS +#undef PCRE2_MATCH_CONTEXT_FUNCTIONS +#undef PCRE2_COMPILE_FUNCTIONS +#undef PCRE2_PATTERN_INFO_FUNCTIONS +#undef PCRE2_MATCH_FUNCTIONS +#undef PCRE2_SUBSTRING_FUNCTIONS +#undef PCRE2_SERIALIZE_FUNCTIONS +#undef PCRE2_SUBSTITUTE_FUNCTION +#undef PCRE2_JIT_FUNCTIONS +#undef PCRE2_OTHER_FUNCTIONS +#undef PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS + +/* PCRE2_CODE_UNIT_WIDTH must be defined. If it is 8, 16, or 32, redefine +PCRE2_SUFFIX to use it. If it is 0, undefine the other macros and make +PCRE2_SUFFIX a no-op. Otherwise, generate an error. */ + +#undef PCRE2_SUFFIX +#ifndef PCRE2_CODE_UNIT_WIDTH +#error PCRE2_CODE_UNIT_WIDTH must be defined before including pcre2.h. +#error Use 8, 16, or 32; or 0 for a multi-width application. 
+#else /* PCRE2_CODE_UNIT_WIDTH is defined */ +#if PCRE2_CODE_UNIT_WIDTH == 8 || \ + PCRE2_CODE_UNIT_WIDTH == 16 || \ + PCRE2_CODE_UNIT_WIDTH == 32 +#define PCRE2_SUFFIX(a) PCRE2_GLUE(a, PCRE2_CODE_UNIT_WIDTH) +#elif PCRE2_CODE_UNIT_WIDTH == 0 +#undef PCRE2_JOIN +#undef PCRE2_GLUE +#define PCRE2_SUFFIX(a) a +#else +#error PCRE2_CODE_UNIT_WIDTH must be 0, 8, 16, or 32. +#endif +#endif /* PCRE2_CODE_UNIT_WIDTH is defined */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* PCRE2_H_IDEMPOTENT_GUARD */ + +/* End of pcre2.h */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_auto_possess.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_auto_possess.c new file mode 100644 index 0000000000..cbd1903e56 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_auto_possess.c @@ -0,0 +1,1365 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains functions that scan a compiled pattern and change +repeats into possessive repeats where possible. */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + + +#include "regexp/pcre2/pcre2_internal.h" + + +/************************************************* +* Tables for auto-possessification * +*************************************************/ + +/* This table is used to check whether auto-possessification is possible +between adjacent character-type opcodes. The left-hand (repeated) opcode is +used to select the row, and the right-hand opcode is used to select the column. +A value of 1 means that auto-possessification is OK. For example, the second +value in the first row means that \D+\d can be turned into \D++\d.
+ +The Unicode property types (\P and \p) have to be present to fill out the table +because of what their opcode values are, but the table values should always be +zero because property types are handled separately in the code. The last four +columns apply to items that cannot be repeated, so there is no need to have +rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is +*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ + +#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) /* one row per opcode in FIRST_AUTOTAB_OP..LAST_AUTOTAB_LEFT_OP */ +#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) /* one column per opcode in FIRST_AUTOTAB_OP..LAST_AUTOTAB_RIGHT_OP */ + +static const uint8_t autoposstab[APTROWS][APTCOLS] = { +/* Indexed as autoposstab[left opcode - FIRST_AUTOTAB_OP][right opcode - FIRST_AUTOTAB_OP]; 1 means the left-hand repeat may be made possessive. Columns: \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ + { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ + { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ + { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ + { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ + { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .
*/ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ + { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ + { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ + { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ +}; + +#ifdef SUPPORT_UNICODE +/* This table is used to check whether auto-possessification is possible +between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The +left-hand (repeated) opcode is used to select the row, and the right-hand +opcode is used to select the column.
The values are as follows: + + 0 Always return FALSE (never auto-possessify) + 1 Character groups are distinct (possessify if both are OP_PROP) + 2 Check character categories in the same group (general or particular) + 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) + + 4 Check left general category vs right particular category + 5 Check right general category vs left particular category + + 6 Left alphanum vs right general category + 7 Left space vs right general category + 8 Left word vs right general category + + 9 Right alphanum vs left general category + 10 Right space vs left general category + 11 Right word vs left general category + + 12 Left alphanum vs right particular category + 13 Left space vs right particular category + 14 Left word vs right particular category + + 15 Right alphanum vs left particular category + 16 Right space vs left particular category + 17 Right word vs left particular category +*/ + +static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { +/* Indexed as propposstab[left property type][right property type]; each entry is one of the action codes 0..17 that compare_opcodes() switches on. Columns: ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ + { 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */ + { 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */ + { 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */ + { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ + { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */ + { 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */ + { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */ + { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */ + { 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */ +}; + +/* This table is used to check
whether auto-possessification is possible +between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one +specifies a general category and the other specifies a particular category. The +row is selected by the general category and the column by the particular +category. The value is 1 if the particular category is not part of the general +category. */ + +static const uint8_t catposstab[7][30] = { +/* Indexed as catposstab[general category][particular category]; the 7 rows are labelled C L M N P S Z on the right. Columns: Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ + { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ + { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ +}; + +/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against +a general or particular category. The properties in each row are those +that apply to the character set in question. Duplication means that a little +unnecessary work is done when checking, but this keeps things much simpler +because they can all use the same code. For more details see the comment where +this table is used. + +Note: SPACE and PXSPACE used to be different because Perl excluded VT from +"space", but from Perl 5.18 it's included, so both categories are treated the +same here.
*/ + +static const uint8_t posspropstab[3][4] = { + { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ + { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ + { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ +}; +#endif /* SUPPORT_UNICODE */ + + + +#ifdef SUPPORT_UNICODE +/************************************************* +* Check a character and a property * +*************************************************/ + +/* This function is called by compare_opcodes() when a property item is +adjacent to a fixed character. For the property types that have a case below, the result is (c has the property) == negated, so a plain property yields TRUE only when c lacks it. PT_BIDICL, PT_BOOL and any type without a case (PT_ANY and PT_UCNC have none) yield FALSE. + +Arguments: + c the character + ptype the property type (a PT_ value) + pdata the data for the type (compared with the general category, particular category or script, depending on ptype) + negated TRUE if it's a negated property (\P or \p{^) + +Returns: TRUE if auto-possessifying is OK +*/ + +static BOOL +check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, + BOOL negated) +{ +BOOL ok; +const uint32_t *p; +const ucd_record *prop = GET_UCD(c); + +switch(ptype) + { + case PT_LAMP: + return (prop->chartype == ucp_Lu || + prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == negated; + + case PT_GC: + return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; + + case PT_PC: + return (pdata == prop->chartype) == negated; + + case PT_SC: + return (pdata == prop->script) == negated; + + case PT_SCX: + ok = (pdata == prop->script + || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); + return ok == negated; + + /* These are specials */ + + case PT_ALNUM: + return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, which + means that Perl space and POSIX space are now identical. PCRE was changed + at release 8.34.
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + return negated; /* c is one of the listed horizontal/vertical space characters, so it has the property */ + + default: + return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; + } + break; /* Control never reaches here */ + + case PT_WORD: + return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE) == negated; + + case PT_CLIST: + p = PRIV(ucd_caseless_sets) + prop->caseset; + for (;;) /* NOTE(review): no explicit bound; relies on the set being in ascending order and ending with a value above any character (presumably NOTACHAR) - confirm */ + { + if (c < *p) return !negated; + if (c == *p++) return negated; + } + break; /* Control never reaches here */ + + /* Haven't yet thought these through. */ + + case PT_BIDICL: + return FALSE; + + case PT_BOOL: + return FALSE; + } + +return FALSE; +} +#endif /* SUPPORT_UNICODE */ + + + +/************************************************* +* Base opcode of repeated opcodes * +*************************************************/ + +/* Returns the base opcode for repeated single character type opcodes. If the +opcode is not a repeated character type, it returns with the original value. More precisely: a value above OP_TYPEPOSUPTO is returned unchanged; otherwise the result is the first of OP_TYPESTAR, OP_NOTSTARI, OP_NOTSTAR, OP_STARI (tested in that order) that c is greater than or equal to, and OP_STAR for anything below OP_STARI. This assumes those opcodes are numbered in ascending order from OP_STAR; the visible caller only passes values in OP_STAR..OP_TYPEPOSUPTO. + +Arguments: c opcode +Returns: base opcode for the type +*/ + +static PCRE2_UCHAR +get_repeat_base(PCRE2_UCHAR c) +{ +return (c > OP_TYPEPOSUPTO)? c : + (c >= OP_TYPESTAR)? OP_TYPESTAR : + (c >= OP_NOTSTARI)? OP_NOTSTARI : + (c >= OP_NOTSTAR)? OP_NOTSTAR : + (c >= OP_STARI)? OP_STARI : + OP_STAR; +} + + +/************************************************* +* Fill the character property list * +*************************************************/ + +/* Checks whether the code points to an opcode that can take part in auto- +possessification, and if so, fills a list with its properties.
+ +Arguments: + code points to start of expression + utf TRUE if in UTF mode + ucp TRUE if in UCP mode + fcc points to the case-flipping table + list points to output list + list[0] will be filled with the opcode + list[1] will be non-zero if this opcode + can match an empty character string + list[2..7] depends on the opcode + +Returns: points to the start of the next opcode if *code is accepted + NULL if *code is not accepted +*/ + +static PCRE2_SPTR +get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, + uint32_t *list) +{ +PCRE2_UCHAR c = *code; +PCRE2_UCHAR base; +PCRE2_SPTR end; +uint32_t chr; + +#ifdef SUPPORT_UNICODE +uint32_t *clist_dest; +const uint32_t *clist_src; +#else +(void)utf; /* Suppress "unused parameter" compiler warnings */ +(void)ucp; +#endif + +list[0] = c; +list[1] = FALSE; +code++; + +if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) /* a repeat opcode: record the item being repeated in list[0] instead */ + { + base = get_repeat_base(c); + c -= (base - OP_STAR); /* shift c into the OP_STAR group so the tests below need only name that group */ + + if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) + code += IMM2_SIZE; /* skip the IMM2_SIZE operand these forms carry (presumably the repeat count) */ + + list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && + c != OP_POSPLUS); /* list[1] is zero only for the PLUS and EXACT forms */ + + switch(base) + { + case OP_STAR: + list[0] = OP_CHAR; + break; + + case OP_STARI: + list[0] = OP_CHARI; + break; + + case OP_NOTSTAR: + list[0] = OP_NOT; + break; + + case OP_NOTSTARI: + list[0] = OP_NOTI; + break; + + case OP_TYPESTAR: + list[0] = *code; /* the repeated character-type opcode follows the repeat opcode */ + code++; + break; + } + c = list[0]; + } + +switch(c) + { + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case OP_ALLANY: + case OP_ANYNL: + case OP_NOT_HSPACE: + case OP_HSPACE: + case OP_NOT_VSPACE: + case OP_VSPACE: + case OP_EXTUNI: + case OP_EODN: + case OP_EOD: + case OP_DOLL: + case OP_DOLLM: + return code; /* these need nothing beyond list[0] and list[1] */ + + case OP_CHAR: + case OP_NOT: + GETCHARINCTEST(chr, code); + list[2] = chr; + list[3] = NOTACHAR; + return code; + + case OP_CHARI: + case OP_NOTI: + list[0] = (c == OP_CHARI) ?
OP_CHAR : OP_NOT; + GETCHARINCTEST(chr, code); + list[2] = chr; + +#ifdef SUPPORT_UNICODE + if (chr < 128 || (chr < 256 && !utf && !ucp)) + list[3] = fcc[chr]; + else + list[3] = UCD_OTHERCASE(chr); +#elif defined SUPPORT_WIDE_CHARS + list[3] = (chr < 256) ? fcc[chr] : chr; +#else + list[3] = fcc[chr]; +#endif + + /* The othercase might be the same value. If so the list holds only the one character followed by NOTACHAR; otherwise it holds both cases followed by NOTACHAR. Either way the caseless opcode has been rewritten above as its caseful equivalent (OP_CHAR or OP_NOT). */ + + if (chr == list[3]) + list[3] = NOTACHAR; + else + list[4] = NOTACHAR; + return code; + +#ifdef SUPPORT_UNICODE + case OP_PROP: + case OP_NOTPROP: + if (code[0] != PT_CLIST) + { + list[2] = code[0]; + list[3] = code[1]; + return code + 2; + } + + /* Convert only if we have enough space. The caseless set selected by code[1] is copied into list[2] onwards, and the loop refuses to write at or beyond list + 8. */ + + clist_src = PRIV(ucd_caseless_sets) + code[1]; + clist_dest = list + 2; + code += 2; + + do { + if (clist_dest >= list + 8) + { + /* Early return if there is not enough space. This should never + happen, since all clists are shorter than 5 characters now. NOTE(review): code was already advanced by 2 above, so code[0] and code[1] here are no longer the property type and data tested at the top of this case - confirm this fallback is what is intended. */ + list[2] = code[0]; + list[3] = code[1]; + return code; + } + *clist_dest++ = *clist_src; + } + while(*clist_src++ != NOTACHAR); + + /* All characters are stored. The terminating NOTACHAR is copied from the + clist itself. */ + + list[0] = (c == OP_PROP) ?
OP_CHAR : OP_NOT; + return code; +#endif + + case OP_NCLASS: + case OP_CLASS: +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + if (c == OP_XCLASS) + end = code + GET(code, 0) - 1; + else +#endif + end = code + 32 / sizeof(PCRE2_UCHAR); /* step over the 32-byte class bitmap, measured in code units */ + + switch(*end) /* an optional repeat opcode may follow the class */ + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: + list[1] = TRUE; + end++; + break; + + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + end++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + list[1] = (GET2(end, 1) == 0); /* non-zero when the first 2-byte operand (presumably the minimum count) is zero */ + end += 1 + 2 * IMM2_SIZE; + break; + } + list[2] = (uint32_t)(end - code); /* distance back from the end of the item to the class data; compare_opcodes() subtracts it from the end pointer to find the bitmap */ + return end; + } + +return NULL; /* Opcode not accepted */ +} + + + +/************************************************* +* Scan further character sets for match * +*************************************************/ + +/* Checks whether the base and the current opcode have a common character, in +which case the base cannot be possessified.
+ +Arguments: + code points to the byte code + utf TRUE in UTF mode + ucp TRUE in UCP mode + cb compile data block + base_list the data list of the base opcode + base_end the end of the base opcode + rec_limit points to recursion depth counter + +Returns: TRUE if the auto-possessification is possible +*/ + +static BOOL +compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, + const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) +{ +PCRE2_UCHAR c; +uint32_t list[8]; +const uint32_t *chr_ptr; +const uint32_t *ochr_ptr; +const uint32_t *list_ptr; +PCRE2_SPTR next_code; +#ifdef SUPPORT_WIDE_CHARS +PCRE2_SPTR xclass_flags; +#endif +const uint8_t *class_bitset; +const uint8_t *set1, *set2, *set_end; +uint32_t chr; +BOOL accepted, invert_bits; +BOOL entered_a_group = FALSE; + +if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ + +/* Note: the base_list[1] contains whether the current opcode has a greedy +(represented by a non-zero value) quantifier. This is a different from +other character type lists, which store here that the character iterator +matches to an empty string (also represented by a non-zero value). */ + +for(;;) + { + /* All operations move the code pointer forward. + Therefore infinite recursions are not possible. */ + + c = *code; + + /* Skip over callouts */ + + if (c == OP_CALLOUT) + { + code += PRIV(OP_lengths)[c]; + continue; + } + + if (c == OP_CALLOUT_STR) + { + code += GET(code, 1 + 2*LINK_SIZE); + continue; + } + + /* At the end of a branch, skip to the end of the group. */ + + if (c == OP_ALT) + { + do code += GET(code, 1); while (*code == OP_ALT); + c = *code; + } + + /* Inspect the next opcode. */ + + switch(c) + { + /* We can always possessify a greedy iterator at the end of the pattern, + which is reached after skipping over the final OP_KET. A non-greedy + iterator must never be possessified. 
*/ + + case OP_END: + return base_list[1] != 0; + + /* When an iterator is at the end of certain kinds of group we can inspect + what follows the group by skipping over the closing ket. Note that this + does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given + iteration is variable (could be another iteration or could be the next + item). As these two opcodes are not listed in the next switch, they will + end up as the next code to inspect, and return FALSE by virtue of being + unsupported. */ + + case OP_KET: + case OP_KETRPOS: + /* The non-greedy case cannot be converted to a possessive form. */ + + if (base_list[1] == 0) return FALSE; + + /* If the bracket is capturing it might be referenced by an OP_RECURSE + so its last iterator can never be possessified if the pattern contains + recursions. (This could be improved by keeping a list of group numbers that + are called by recursion.) */ + + switch(*(code - GET(code, 1))) + { + case OP_CBRA: + case OP_SCBRA: + case OP_CBRAPOS: + case OP_SCBRAPOS: + if (cb->had_recurse) return FALSE; + break; + + /* A script run might have to backtrack if the iterated item can match + characters from more than one script. So give up unless repeating an + explicit character. */ + + case OP_SCRIPT_RUN: + if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) + return FALSE; + break; + + /* Atomic sub-patterns and assertions can always auto-possessify their + last iterator. However, if the group was entered as a result of checking + a previous iterator, this is not possible. */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ONCE: + return !entered_a_group; + + /* Non-atomic assertions - don't possessify last iterator. This needs + more thought. */ + + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: + return FALSE; + } + + /* Skip over the bracket and inspect what comes next. 
*/ + + code += PRIV(OP_lengths)[c]; + continue; + + /* Handle cases where the next item is a group. */ + + case OP_ONCE: + case OP_BRA: + case OP_CBRA: + next_code = code + GET(code, 1); + code += PRIV(OP_lengths)[c]; + + /* Check each branch. We have to recurse a level for all but the last + branch. */ + + while (*next_code == OP_ALT) + { + if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) + return FALSE; + code = next_code + 1 + LINK_SIZE; + next_code += GET(next_code, 1); + } + + entered_a_group = TRUE; + continue; + + case OP_BRAZERO: + case OP_BRAMINZERO: + + next_code = code + 1; + if (*next_code != OP_BRA && *next_code != OP_CBRA && + *next_code != OP_ONCE) return FALSE; + + do next_code += GET(next_code, 1); while (*next_code == OP_ALT); + + /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ + + next_code += 1 + LINK_SIZE; + if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, + rec_limit)) + return FALSE; + + code += PRIV(OP_lengths)[c]; + continue; + + /* The next opcode does not need special handling; fall through and use it + to see if the base can be possessified. */ + + default: + break; + } + + /* We now have the next appropriate opcode to compare with the base. Check + for a supported opcode, and load its properties. */ + + code = get_chr_property_list(code, utf, ucp, cb->fcc, list); + if (code == NULL) return FALSE; /* Unsupported */ + + /* If either opcode is a small character list, set pointers for comparing + characters from that list with another list, or with a property. */ + + if (base_list[0] == OP_CHAR) + { + chr_ptr = base_list + 2; + list_ptr = list; + } + else if (list[0] == OP_CHAR) + { + chr_ptr = list + 2; + list_ptr = base_list; + } + + /* Character bitsets can also be compared to certain opcodes. */ + + else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS +#if PCRE2_CODE_UNIT_WIDTH == 8 + /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. 
*/ + || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) +#endif + ) + { +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) +#else + if (base_list[0] == OP_CLASS) +#endif + { + set1 = (uint8_t *)(base_end - base_list[2]); + list_ptr = list; + } + else + { + set1 = (uint8_t *)(code - list[2]); + list_ptr = base_list; + } + + invert_bits = FALSE; + switch(list_ptr[0]) + { + case OP_CLASS: + case OP_NCLASS: + set2 = (uint8_t *) + ((list_ptr == list ? code : base_end) - list_ptr[2]); + break; + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; + if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; + if ((*xclass_flags & XCL_MAP) == 0) + { + /* No bits are set for characters < 256. */ + if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; + /* Might be an empty repeat. */ + continue; + } + set2 = (uint8_t *)(xclass_flags + 1); + break; +#endif + + case OP_NOT_DIGIT: + invert_bits = TRUE; + /* Fall through */ + case OP_DIGIT: + set2 = (uint8_t *)(cb->cbits + cbit_digit); + break; + + case OP_NOT_WHITESPACE: + invert_bits = TRUE; + /* Fall through */ + case OP_WHITESPACE: + set2 = (uint8_t *)(cb->cbits + cbit_space); + break; + + case OP_NOT_WORDCHAR: + invert_bits = TRUE; + /* Fall through */ + case OP_WORDCHAR: + set2 = (uint8_t *)(cb->cbits + cbit_word); + break; + + default: + return FALSE; + } + + /* Because the bit sets are unaligned bytes, we need to perform byte + comparison here. */ + + set_end = set1 + 32; + if (invert_bits) + { + do + { + if ((*set1++ & ~(*set2++)) != 0) return FALSE; + } + while (set1 < set_end); + } + else + { + do + { + if ((*set1++ & *set2++) != 0) return FALSE; + } + while (set1 < set_end); + } + + if (list[1] == 0) return TRUE; + /* Might be an empty repeat. */ + continue; + } + + /* Some property combinations also acceptable. 
Unicode property opcodes are + processed specially; the rest can be handled with a lookup table. */ + + else + { + uint32_t leftop, rightop; + + leftop = base_list[0]; + rightop = list[0]; + +#ifdef SUPPORT_UNICODE + accepted = FALSE; /* Always set in non-unicode case. */ + if (leftop == OP_PROP || leftop == OP_NOTPROP) + { + if (rightop == OP_EOD) + accepted = TRUE; + else if (rightop == OP_PROP || rightop == OP_NOTPROP) + { + int n; + const uint8_t *p; + BOOL same = leftop == rightop; + BOOL lisprop = leftop == OP_PROP; + BOOL risprop = rightop == OP_PROP; + BOOL bothprop = lisprop && risprop; + + /* There's a table that specifies how each combination is to be + processed: + 0 Always return FALSE (never auto-possessify) + 1 Character groups are distinct (possessify if both are OP_PROP) + 2 Check character categories in the same group (general or particular) + 3 Return TRUE if the two opcodes are not the same + ... see comments below + */ + + n = propposstab[base_list[2]][list[2]]; + switch(n) + { + case 0: break; + case 1: accepted = bothprop; break; + case 2: accepted = (base_list[3] == list[3]) != same; break; + case 3: accepted = !same; break; + + case 4: /* Left general category, right particular category */ + accepted = risprop && catposstab[base_list[3]][list[3]] == same; + break; + + case 5: /* Right general category, left particular category */ + accepted = lisprop && catposstab[list[3]][base_list[3]] == same; + break; + + /* This code is logically tricky. Think hard before fiddling with it. + The posspropstab table has four entries per row. Each row relates to + one of PCRE's special properties such as ALNUM or SPACE or WORD. + Only WORD actually needs all four entries, but using repeats for the + others means they can all use the same code below. + + The first two entries in each row are Unicode general categories, and + apply always, because all the characters they include are part of the + PCRE character set. 
The third and fourth entries are a general and a + particular category, respectively, that include one or more relevant + characters. One or the other is used, depending on whether the check + is for a general or a particular category. However, in both cases the + category contains more characters than the specials that are defined + for the property being tested against. Therefore, it cannot be used + in a NOTPROP case. + + Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. + Underscore is covered by ucp_P or ucp_Po. */ + + case 6: /* Left alphanum vs right general category */ + case 7: /* Left space vs right general category */ + case 8: /* Left word vs right general category */ + p = posspropstab[n-6]; + accepted = risprop && lisprop == + (list[3] != p[0] && + list[3] != p[1] && + (list[3] != p[2] || !lisprop)); + break; + + case 9: /* Right alphanum vs left general category */ + case 10: /* Right space vs left general category */ + case 11: /* Right word vs left general category */ + p = posspropstab[n-9]; + accepted = lisprop && risprop == + (base_list[3] != p[0] && + base_list[3] != p[1] && + (base_list[3] != p[2] || !risprop)); + break; + + case 12: /* Left alphanum vs right particular category */ + case 13: /* Left space vs right particular category */ + case 14: /* Left word vs right particular category */ + p = posspropstab[n-12]; + accepted = risprop && lisprop == + (catposstab[p[0]][list[3]] && + catposstab[p[1]][list[3]] && + (list[3] != p[3] || !lisprop)); + break; + + case 15: /* Right alphanum vs left particular category */ + case 16: /* Right space vs left particular category */ + case 17: /* Right word vs left particular category */ + p = posspropstab[n-15]; + accepted = lisprop && risprop == + (catposstab[p[0]][base_list[3]] && + catposstab[p[1]][base_list[3]] && + (base_list[3] != p[3] || !risprop)); + break; + } + } + } + + else +#endif /* SUPPORT_UNICODE */ + + accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= 
LAST_AUTOTAB_LEFT_OP && + rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && + autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; + + if (!accepted) return FALSE; + + if (list[1] == 0) return TRUE; + /* Might be an empty repeat. */ + continue; + } + + /* Control reaches here only if one of the items is a small character list. + All characters are checked against the other side. */ + + do + { + chr = *chr_ptr; + + switch(list_ptr[0]) + { + case OP_CHAR: + ochr_ptr = list_ptr + 2; + do + { + if (chr == *ochr_ptr) return FALSE; + ochr_ptr++; + } + while(*ochr_ptr != NOTACHAR); + break; + + case OP_NOT: + ochr_ptr = list_ptr + 2; + do + { + if (chr == *ochr_ptr) + break; + ochr_ptr++; + } + while(*ochr_ptr != NOTACHAR); + if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ + break; + + /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* + set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ + + case OP_DIGIT: + if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; + break; + + case OP_NOT_DIGIT: + if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; + break; + + case OP_WHITESPACE: + if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; + break; + + case OP_NOT_WHITESPACE: + if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; + break; + + case OP_WORDCHAR: + if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; + break; + + case OP_NOT_WORDCHAR: + if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; + break; + + case OP_HSPACE: + switch(chr) + { + HSPACE_CASES: return FALSE; + default: break; + } + break; + + case OP_NOT_HSPACE: + switch(chr) + { + HSPACE_CASES: break; + default: return FALSE; + } + break; + + case OP_ANYNL: + case OP_VSPACE: + switch(chr) + { + VSPACE_CASES: return FALSE; + default: break; + } + break; + + case OP_NOT_VSPACE: + switch(chr) + { + VSPACE_CASES: break; + default: return 
FALSE; + } + break; + + case OP_DOLL: + case OP_EODN: + switch (chr) + { + case CHAR_CR: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + return FALSE; + } + break; + + case OP_EOD: /* Can always possessify before \z */ + break; + +#ifdef SUPPORT_UNICODE + case OP_PROP: + case OP_NOTPROP: + if (!check_char_prop(chr, list_ptr[2], list_ptr[3], + list_ptr[0] == OP_NOTPROP)) + return FALSE; + break; +#endif + + case OP_NCLASS: + if (chr > 255) return FALSE; + /* Fall through */ + + case OP_CLASS: + if (chr > 255) break; + class_bitset = (uint8_t *) + ((list_ptr == list ? code : base_end) - list_ptr[2]); + if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; + break; + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - + list_ptr[2] + LINK_SIZE, utf)) return FALSE; + break; +#endif + + default: + return FALSE; + } + + chr_ptr++; + } + while(*chr_ptr != NOTACHAR); + + /* At least one character must be matched from this opcode. */ + + if (list[1] == 0) return TRUE; + } + +/* Control never reaches here. There used to be a fail-safe return FALSE; here, +but some compilers complain about an unreachable statement. */ +} + + + +/************************************************* +* Scan compiled regex for auto-possession * +*************************************************/ + +/* Replaces single character iterations with their possessive alternatives +if appropriate. This function modifies the compiled opcode! Hitting a +non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a +bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches +overly complicated or large patterns. In these cases, the check just stops, +leaving the remainder of the pattern unpossessified. The scan walks the code one item at a time until OP_END is seen; a repeat opcode is rewritten in place only when get_chr_property_list() returns non-NULL and compare_opcodes() then returns TRUE. 
+ +Arguments: + code points to start of the byte code + cb compile data block + +Returns: 0 for success (OP_END was reached) + -1 if a non-existent opcode is encountered +*/ + +int +PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) +{ +PCRE2_UCHAR c; +PCRE2_SPTR end; +PCRE2_UCHAR *repeat_opcode; +uint32_t list[8]; /* Passed to get_chr_property_list(); list[1] is then overwritten here */ +int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ +BOOL utf = (cb->external_options & PCRE2_UTF) != 0; +BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; + +for (;;) + { + c = *code; + + if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */ + + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) + { + c -= get_repeat_base(c) - OP_STAR; /* Rebase c onto the OP_STAR group; *code itself is left unchanged */ + end = (c <= OP_MINUPTO) ? + get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; + list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; /* 1 for STAR, PLUS, QUERY, UPTO; 0 for their MIN counterparts */ + + if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, + &rec_limit)) + { + switch(c) /* Add an offset rather than assign: c was rebased above but *code still holds the original opcode */ + { + case OP_STAR: + *code += OP_POSSTAR - OP_STAR; + break; + + case OP_MINSTAR: + *code += OP_POSSTAR - OP_MINSTAR; + break; + + case OP_PLUS: + *code += OP_POSPLUS - OP_PLUS; + break; + + case OP_MINPLUS: + *code += OP_POSPLUS - OP_MINPLUS; + break; + + case OP_QUERY: + *code += OP_POSQUERY - OP_QUERY; + break; + + case OP_MINQUERY: + *code += OP_POSQUERY - OP_MINQUERY; + break; + + case OP_UPTO: + *code += OP_POSUPTO - OP_UPTO; + break; + + case OP_MINUPTO: + *code += OP_POSUPTO - OP_MINUPTO; + break; + } + } + c = *code; + } + else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) + { +#ifdef SUPPORT_WIDE_CHARS + if (c == OP_XCLASS) + repeat_opcode = code + GET(code, 1); + else +#endif + repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); /* Skip the opcode and 32 bytes of code units - presumably the class bit map */ + + c = *repeat_opcode; + if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) + { + /* The return from get_chr_property_list() will never be NULL when + *code (aka c) is one of the three class opcodes. However, gcc with + -fanalyzer notes that a NULL return is possible, and grumbles. 
Hence we + put in a check. */ + + end = get_chr_property_list(code, utf, ucp, cb->fcc, list); + list[1] = (c & 1) == 0; /* Set for even-numbered repeat opcodes; presumably these are the non-MIN forms - confirm against the opcode list */ + + if (end != NULL && + compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) + { + switch (c) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + *repeat_opcode = OP_CRPOSSTAR; + break; + + case OP_CRPLUS: + case OP_CRMINPLUS: + *repeat_opcode = OP_CRPOSPLUS; + break; + + case OP_CRQUERY: + case OP_CRMINQUERY: + *repeat_opcode = OP_CRPOSQUERY; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + *repeat_opcode = OP_CRPOSRANGE; + break; + } + } + } + c = *code; + } + + switch(c) + { + case OP_END: + return 0; + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* Step over the two extra code units that accompany OP_(NOT)PROP */ + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) + code += 2; /* Same two-unit skip; here the type is looked up IMM2_SIZE units further on */ + break; + + case OP_CALLOUT_STR: + code += GET(code, 1 + 2*LINK_SIZE); + break; + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + code += GET(code, 1); + break; +#endif + + case OP_MARK: + case OP_COMMIT_ARG: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; /* code[1] is used as an extra length, on top of the fixed length added below */ + break; + } + + /* Add in the fixed length from the table */ + + code += PRIV(OP_lengths)[c]; + + /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a minimum, so + we have to arrange to skip the extra code units. 
*/ + +#ifdef MAYBE_UTF_MULTI + if (utf) switch(c) + { + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_UPTO: + case OP_MINUPTO: + case OP_EXACT: + case OP_POSSTAR: + case OP_POSPLUS: + case OP_POSQUERY: + case OP_POSUPTO: + case OP_STARI: + case OP_MINSTARI: + case OP_PLUSI: + case OP_MINPLUSI: + case OP_QUERYI: + case OP_MINQUERYI: + case OP_UPTOI: + case OP_MINUPTOI: + case OP_EXACTI: + case OP_POSSTARI: + case OP_POSPLUSI: + case OP_POSQUERYI: + case OP_POSUPTOI: + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + case OP_NOTUPTO: + case OP_NOTMINUPTO: + case OP_NOTEXACT: + case OP_NOTPOSSTAR: + case OP_NOTPOSPLUS: + case OP_NOTPOSQUERY: + case OP_NOTPOSUPTO: + case OP_NOTSTARI: + case OP_NOTMINSTARI: + case OP_NOTPLUSI: + case OP_NOTMINPLUSI: + case OP_NOTQUERYI: + case OP_NOTMINQUERYI: + case OP_NOTUPTOI: + case OP_NOTMINUPTOI: + case OP_NOTEXACTI: + case OP_NOTPOSSTARI: + case OP_NOTPOSPLUSI: + case OP_NOTPOSQUERYI: + case OP_NOTPOSUPTOI: + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); /* code was already advanced above, so code[-1] is the last unit of the fixed-length part */ + break; + } +#else + (void)(utf); /* Keep compiler happy by referencing utf (a local variable in this function) */ +#endif /* MAYBE_UTF_MULTI */ + } +} + +/* End of pcre2_auto_possess.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_chartables.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_chartables.c new file mode 100644 index 0000000000..8cdba88dcf --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_chartables.c @@ -0,0 +1,202 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* This file was automatically written by the pcre2_dftables auxiliary +program. 
It contains character tables that are used when no external +tables are passed to PCRE2 by the application that calls it. The tables +are used only for characters whose code values are less than 256. */ + +/* This set of tables was written in the C locale. */ + +/* The pcre2_dftables program (which is distributed with PCRE2) can be used +to build alternative versions of this file. This is necessary if you are +running in an EBCDIC environment, or if you want to default to a different +encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates +these tables in the "C" locale by default. This happens automatically if +PCRE2 is configured with --enable-rebuild-chartables. However, you can run +pcre2_dftables manually with the -L option to build tables using the LC_ALL +locale. */ + +/* The following #include is present because without it gcc 4.x may remove +the array definition from the final binary if PCRE2 is built into a static +library and dead code stripping is activated. This leads to link errors. +Pulling in the header ensures that the array gets flagged as "someone +outside this compilation unit might reference this" and so it will always +be supplied to the linker. */ + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + +const uint8_t PRIV(default_tables)[] = { + +/* This table is a lower casing table. 
*/ + + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122, 91, 92, 93, 94, 95, + 96, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183, + 184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199, + 200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215, + 216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231, + 232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247, + 248,249,250,251,252,253,254,255, + +/* This table is a case flipping table. 
*/ + + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119, + 120,121,122, 91, 92, 93, 94, 95, + 96, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90,123,124,125,126,127, + 128,129,130,131,132,133,134,135, + 136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151, + 152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167, + 168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183, + 184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199, + 200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215, + 216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231, + 232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247, + 248,249,250,251,252,253,254,255, + +/* This table contains bit maps for various character classes. Each map is 32 +bytes long and the bits run from the least significant end of each byte. The +classes that have their own maps are: space, xdigit, digit, upper, lower, word, +graph, print, punct, and cntrl. Other classes are built from combinations. 
*/ + + 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */ + 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */ + 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */ + 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */ + 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */ + 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + + 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + +/* This table identifies various classes of character by individual bits: + 0x01 white space character + 0x02 letter + 0x04 lower 
case letter + 0x08 decimal digit + 0x10 alphanumeric or '_' +*/ + + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ + 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, /* 0 - 7 */ + 0x18,0x18,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ + 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* @ - G */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ + 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ + 0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x10, /* X - _ */ + 0x00,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* ` - g */ + 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* h - o */ + 0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* p - w */ + 0x16,0x16,0x16,0x00,0x00,0x00,0x00,0x00, /* x -127 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ + +/* End of pcre2_chartables.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_compile.c 
b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_compile.c new file mode 100644 index 0000000000..3c4461d81a --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_compile.c @@ -0,0 +1,10769 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2023 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#define NLBLOCK cb /* Block containing newline information */ +#define PSSTART start_pattern /* Field containing processed string start */ +#define PSEND end_pattern /* Field containing processed string end */ + +#include "regexp/pcre2/pcre2_internal.h" + +/* In rare error cases debugging might require calling pcre2_printint(). */ + +#if 0 +#ifdef EBCDIC +#define PRINTABLE(c) ((c) >= 64 && (c) < 255) +#else +#define PRINTABLE(c) ((c) >= 32 && (c) < 127) +#endif +#include "regexp/pcre2/pcre2_printint.c" +#define DEBUG_CALL_PRINTINT +#endif + +/* Other debugging code can be enabled by these defines. */ + +/* #define DEBUG_SHOW_CAPTURES */ +/* #define DEBUG_SHOW_PARSED */ + +/* There are a few things that vary with different code unit sizes. Handle them +by defining macros in order to minimize #if usage. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 +#define XDIGIT(c) xdigitab[c] + +#else /* Either 16-bit or 32-bit */ +#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff) + +#if PCRE2_CODE_UNIT_WIDTH == 16 +#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 + +#else /* 32-bit */ +#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 +#endif +#endif + +/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which +consists of uint32_t elements. 
Assume that if uint32_t can't hold it, two of +them will be able to (i.e. assume a 64-bit world). */ + +#if PCRE2_SIZE_MAX <= UINT32_MAX +#define PUTOFFSET(s,p) *p++ = s +#define GETOFFSET(s,p) s = *p++ +#define GETPLUSOFFSET(s,p) s = *(++p) +#define READPLUSOFFSET(s,p) s = p[1] +#define SKIPOFFSET(p) p++ +#define SIZEOFFSET 1 +#else +#define PUTOFFSET(s,p) \ + { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); } +#define GETOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; } +#define GETPLUSOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; } +#define READPLUSOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } +#define SKIPOFFSET(p) p += 2 +#define SIZEOFFSET 2 +#endif + +/* Macros for manipulating elements of the parsed pattern vector. */ + +#define META_CODE(x) (x & 0xffff0000u) +#define META_DATA(x) (x & 0x0000ffffu) +#define META_DIFF(x,y) ((x-y)>>16) + +/* Function definitions to allow mutual recursion */ + +#ifdef SUPPORT_UNICODE +static unsigned int + add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t, + compile_block *, const uint32_t *, unsigned int); +#endif + +static int + compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *, + uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *, + compile_block *, PCRE2_SIZE *); + +static int + get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, + compile_block *); + +static BOOL + set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, + compile_block *); + +static int + check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *, + compile_block *, int *); + + +/************************************************* +* Code parameters and static tables * +*************************************************/ + +#define MAX_GROUP_NUMBER 65535u +#define MAX_REPEAT_COUNT 65535u +#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) + +/* COMPILE_WORK_SIZE 
specifies the size of stack workspace, which is used in +different ways in the different pattern scans. The parsing and group- +identifying pre-scan uses it to handle nesting, and needs it to be 16-bit +aligned for this. Having defined the size in code units, we set up +C16_WORK_SIZE as the number of elements in the 16-bit vector. + +During the first compiling phase, when determining how much memory is required, +the regex is partly compiled into this space, but the compiled parts are +discarded as soon as they can be, so that hopefully there will never be an +overrun. The code does, however, check for an overrun, which can occur for +pathological patterns. The size of the workspace depends on LINK_SIZE because +the length of compiled items varies with this. + +In the real compile phase, this workspace is not currently used. */ + +#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */ + +#define C16_WORK_SIZE \ + ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) + +/* A uint32_t vector is used for caching information about the size of +capturing groups, to improve performance. A default is created on the stack of +this size. */ + +#define GROUPINFO_DEFAULT_SIZE 256 + +/* The overrun tests check for a slightly smaller size so that they detect the +overrun before it actually does run off the end of the data block. */ + +#define WORK_SIZE_SAFETY_MARGIN (100) + +/* This value determines the size of the initial vector that is used for +remembering named groups during the pre-compile. It is allocated on the stack, +but if it is too small, it is expanded, in a similar way to the workspace. The +value is the number of slots in the list. */ + +#define NAMED_GROUP_LIST_SIZE 20 + +/* The pre-compiling pass over the pattern creates a parsed pattern in a vector +of uint32_t. For short patterns this lives on the stack, with this size. Heap +memory is used for longer patterns. 
*/ + +#define PARSED_PATTERN_DEFAULT_SIZE 1024 + +/* Maximum length value to check against when making sure that the variable +that holds the compiled pattern length does not overflow. We make it a bit less +than INT_MAX to allow for adding in group terminating code units, so that we +don't have to check them every time. */ + +#define OFLOW_MAX (INT_MAX - 20) + +/* Code values for parsed patterns, which are stored in a vector of 32-bit +unsigned ints. Values less than META_END are literal data values. The coding +for identifying the item is in the top 16-bits, leaving 16 bits for the +additional data that some of them need. The META_CODE, META_DATA, and META_DIFF +macros are used to manipulate parsed pattern elements. + +NOTE: When these definitions are changed, the table of extra lengths for each +code (meta_extra_lengths, just below) must be updated to remain in step. */ + +#define META_END 0x80000000u /* End of pattern */ + +#define META_ALT 0x80010000u /* alternation */ +#define META_ATOMIC 0x80020000u /* atomic group */ +#define META_BACKREF 0x80030000u /* Back ref */ +#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ +#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ +#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ +#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ +#define META_CAPTURE 0x80080000u /* Capturing parenthesis */ +#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ +#define META_CLASS 0x800a0000u /* start non-empty class */ +#define META_CLASS_EMPTY 0x800b0000u /* empty class */ +#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ +#define META_CLASS_END 0x800d0000u /* end of non-empty class */ +#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ +#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ +#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ +#define META_COND_NAME 0x80110000u /* (?(<name>)... 
*/ +#define META_COND_NUMBER 0x80120000u /* (?(digits)... */ +#define META_COND_RNAME 0x80130000u /* (?(R&name)... */ +#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ +#define META_COND_VERSION 0x80150000u /* (?(VERSIONx.y)... */ +#define META_DOLLAR 0x80160000u /* $ metacharacter */ +#define META_DOT 0x80170000u /* . metacharacter */ +#define META_ESCAPE 0x80180000u /* \d and friends */ +#define META_KET 0x80190000u /* closing parenthesis */ +#define META_NOCAPTURE 0x801a0000u /* no capture parens */ +#define META_OPTIONS 0x801b0000u /* (?i) and friends */ +#define META_POSIX 0x801c0000u /* POSIX class item */ +#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ +#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ +#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ +#define META_RECURSE 0x80200000u /* Recursion */ +#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ +#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */ + +/* These must be kept together to make it easy to check that an assertion +is present where expected in a conditional group. */ + +#define META_LOOKAHEAD 0x80230000u /* (?= */ +#define META_LOOKAHEADNOT 0x80240000u /* (?! 
*/ +#define META_LOOKBEHIND 0x80250000u /* (?<= */ +#define META_LOOKBEHINDNOT 0x80260000u /* (?= 10 */ + 1+SIZEOFFSET, /* META_BACKREF_BYNAME */ + 1, /* META_BIGVALUE */ + 3, /* META_CALLOUT_NUMBER */ + 3+SIZEOFFSET, /* META_CALLOUT_STRING */ + 0, /* META_CAPTURE */ + 0, /* META_CIRCUMFLEX */ + 0, /* META_CLASS */ + 0, /* META_CLASS_EMPTY */ + 0, /* META_CLASS_EMPTY_NOT */ + 0, /* META_CLASS_END */ + 0, /* META_CLASS_NOT */ + 0, /* META_COND_ASSERT */ + SIZEOFFSET, /* META_COND_DEFINE */ + 1+SIZEOFFSET, /* META_COND_NAME */ + 1+SIZEOFFSET, /* META_COND_NUMBER */ + 1+SIZEOFFSET, /* META_COND_RNAME */ + 1+SIZEOFFSET, /* META_COND_RNUMBER */ + 3, /* META_COND_VERSION */ + 0, /* META_DOLLAR */ + 0, /* META_DOT */ + 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */ + 0, /* META_KET */ + 0, /* META_NOCAPTURE */ + 1, /* META_OPTIONS */ + 1, /* META_POSIX */ + 1, /* META_POSIX_NEG */ + 0, /* META_RANGE_ESCAPED */ + 0, /* META_RANGE_LITERAL */ + SIZEOFFSET, /* META_RECURSE */ + 1+SIZEOFFSET, /* META_RECURSE_BYNAME */ + 0, /* META_SCRIPT_RUN */ + 0, /* META_LOOKAHEAD */ + 0, /* META_LOOKAHEADNOT */ + SIZEOFFSET, /* META_LOOKBEHIND */ + SIZEOFFSET, /* META_LOOKBEHINDNOT */ + 0, /* META_LOOKAHEAD_NA */ + SIZEOFFSET, /* META_LOOKBEHIND_NA */ + 1, /* META_MARK - plus the string length */ + 0, /* META_ACCEPT */ + 0, /* META_FAIL */ + 0, /* META_COMMIT */ + 1, /* META_COMMIT_ARG - plus the string length */ + 0, /* META_PRUNE */ + 1, /* META_PRUNE_ARG - plus the string length */ + 0, /* META_SKIP */ + 1, /* META_SKIP_ARG - plus the string length */ + 0, /* META_THEN */ + 1, /* META_THEN_ARG - plus the string length */ + 0, /* META_ASTERISK */ + 0, /* META_ASTERISK_PLUS */ + 0, /* META_ASTERISK_QUERY */ + 0, /* META_PLUS */ + 0, /* META_PLUS_PLUS */ + 0, /* META_PLUS_QUERY */ + 0, /* META_QUERY */ + 0, /* META_QUERY_PLUS */ + 0, /* META_QUERY_QUERY */ + 2, /* META_MINMAX */ + 2, /* META_MINMAX_PLUS */ + 2 /* META_MINMAX_QUERY */ +}; + +/* Types for skipping parts of a 
parsed pattern. */ + +enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET }; + +/* Macro for setting individual bits in class bitmaps. It took some +experimenting to figure out how to stop gcc 5.3.0 from warning with +-Wconversion. This version gets a warning: + + #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7)) + +Let's hope the apparently less efficient version isn't actually so bad if the +compiler is clever with identical subexpressions. */ + +#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7))) + +/* Values and flags for the unsigned xxcuflags variables that accompany xxcu +variables, which are concerned with first and required code units. A value +greater than or equal to REQ_NONE means "no code unit set"; otherwise the +matching xxcu variable is set, and the low valued bits are relevant. */ + +#define REQ_UNSET 0xffffffffu /* Not yet found anything */ +#define REQ_NONE 0xfffffffeu /* Found not fixed character */ +#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */ +#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */ + +/* These flags are used in the groupinfo vector. */ + +#define GI_SET_FIXED_LENGTH 0x80000000u +#define GI_NOT_FIXED_LENGTH 0x40000000u +#define GI_FIXED_LENGTH_MASK 0x0000ffffu + +/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC +and is fast (a good compiler can turn it into a subtraction and unsigned +comparison). */ + +#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) + +/* Table to identify hex digits. The tables in chartables are dependent on the +locale, and may mark arbitrary characters as digits. We want to recognize only +0-9, a-f, and A-F as hex digits, which is why we have a private table here. It +costs 256 bytes, but it is a lot faster than doing character value tests (at +least in some simple cases I timed), and in some applications one wants PCRE2 +to compile efficiently as well as match efficiently. 
The value in the table is +the binary hex digit value, or 0xff for non-hex digits. */ + +/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in +UTF-8 mode. */ + +#ifndef EBCDIC +static const uint8_t xdigitab[] = + { + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */ + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */ + 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */ + 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */ + 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */ + 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */ + +#else + +/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ + +static const uint8_t xdigitab[] = + { + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */ + 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ + 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */ + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */ + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */ + 
0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */ +#endif /* EBCDIC */ + + +/* Table for handling alphanumeric escaped characters. Positive returns are +simple data values; negative values are for special things like \d and so on. +Zero means further processing is needed (for things like \x), or the escape is +invalid. */ + +/* This is the "normal" table for ASCII systems or for EBCDIC systems running +in UTF-8 mode. It runs from '0' to 'z'. */ + +#ifndef EBCDIC +#define ESCAPES_FIRST CHAR_0 +#define ESCAPES_LAST CHAR_z +#define UPPER_CASE(c) (c-32) + +static const short int escapes[] = { + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + CHAR_COLON, CHAR_SEMICOLON, + CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, + CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, + CHAR_COMMERCIAL_AT, -ESC_A, + -ESC_B, -ESC_C, + -ESC_D, -ESC_E, + 0, -ESC_G, + -ESC_H, 0, + 0, -ESC_K, + 0, 0, + -ESC_N, 0, + -ESC_P, -ESC_Q, + -ESC_R, -ESC_S, + 0, 0, + -ESC_V, -ESC_W, + -ESC_X, 0, + -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, + CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, + CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, + CHAR_GRAVE_ACCENT, CHAR_BEL, + -ESC_b, 0, + -ESC_d, CHAR_ESC, + CHAR_FF, 0, + -ESC_h, 0, + 0, -ESC_k, + 0, 0, + CHAR_LF, 0, + -ESC_p, 0, + CHAR_CR, -ESC_s, + CHAR_HT, 0, + -ESC_v, -ESC_w, + 0, 0, + -ESC_z +}; + +#else + +/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. +It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code +is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a +because it is defined as 'a', which of course picks up the ASCII value. 
*/ + +#if 'a' == 0x81 /* Check for a real EBCDIC environment */ +#define ESCAPES_FIRST CHAR_a +#define ESCAPES_LAST CHAR_9 +#define UPPER_CASE(c) (c+64) +#else /* Testing in an ASCII environment */ +#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ +#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ +#define UPPER_CASE(c) (c-32) +#endif + +static const short int escapes[] = { +/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0, +/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0, +/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p, +/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0, +/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0, +/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0, +/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', +/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, +/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0, +/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P, +/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0, +/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X, +/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0, +/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, +/* F8 */ 0, 0 +}; + +/* We also need a table of characters that may follow \c in an EBCDIC +environment for characters 0-31. */ + +static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; + +#endif /* EBCDIC */ + + +/* Table of special "verbs" like (*PRUNE). This is a short table, so it is +searched linearly. Put all the names into a single string, in order to reduce +the number of relocations when a shared library is dynamically linked. The +string is built from string macros so that it works in UTF-8 mode on EBCDIC +platforms. 
*/ + +typedef struct verbitem { + unsigned int len; /* Length of verb name */ + uint32_t meta; /* Base META_ code */ + int has_arg; /* Argument requirement */ +} verbitem; + +static const char verbnames[] = + "\0" /* Empty name is a shorthand for MARK */ + STRING_MARK0 + STRING_ACCEPT0 + STRING_F0 + STRING_FAIL0 + STRING_COMMIT0 + STRING_PRUNE0 + STRING_SKIP0 + STRING_THEN; + +static const verbitem verbs[] = { + { 0, META_MARK, +1 }, /* > 0 => must have an argument */ + { 4, META_MARK, +1 }, + { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */ + { 1, META_FAIL, -1 }, + { 4, META_FAIL, -1 }, + { 6, META_COMMIT, 0 }, + { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */ + { 4, META_SKIP, 0 }, + { 4, META_THEN, 0 } +}; + +static const int verbcount = sizeof(verbs)/sizeof(verbitem); + +/* Verb opcodes, indexed by their META code offset from META_MARK. */ + +static const uint32_t verbops[] = { + OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE, + OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG }; + +/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. 
*/ + +typedef struct alasitem { + unsigned int len; /* Length of name */ + uint32_t meta; /* Base META_ code */ +} alasitem; + +static const char alasnames[] = + STRING_pla0 + STRING_plb0 + STRING_napla0 + STRING_naplb0 + STRING_nla0 + STRING_nlb0 + STRING_positive_lookahead0 + STRING_positive_lookbehind0 + STRING_non_atomic_positive_lookahead0 + STRING_non_atomic_positive_lookbehind0 + STRING_negative_lookahead0 + STRING_negative_lookbehind0 + STRING_atomic0 + STRING_sr0 + STRING_asr0 + STRING_script_run0 + STRING_atomic_script_run; + +static const alasitem alasmeta[] = { + { 3, META_LOOKAHEAD }, + { 3, META_LOOKBEHIND }, + { 5, META_LOOKAHEAD_NA }, + { 5, META_LOOKBEHIND_NA }, + { 3, META_LOOKAHEADNOT }, + { 3, META_LOOKBEHINDNOT }, + { 18, META_LOOKAHEAD }, + { 19, META_LOOKBEHIND }, + { 29, META_LOOKAHEAD_NA }, + { 30, META_LOOKBEHIND_NA }, + { 18, META_LOOKAHEADNOT }, + { 19, META_LOOKBEHINDNOT }, + { 6, META_ATOMIC }, + { 2, META_SCRIPT_RUN }, /* sr = script run */ + { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */ + { 10, META_SCRIPT_RUN }, /* script run */ + { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */ +}; + +static const int alascount = sizeof(alasmeta)/sizeof(alasitem); + +/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */ + +static uint32_t chartypeoffset[] = { + OP_STAR - OP_STAR, OP_STARI - OP_STAR, + OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR }; + +/* Tables of names of POSIX character classes and their lengths. The names are +now all in a single string, to reduce the number of relocations when a shared +library is dynamically loaded. The list of lengths is terminated by a zero +length entry. The first three must be alpha, lower, upper, as this is assumed +for handling case independence. The indices for several classes are needed, so +identify them. 
*/ + +static const char posix_names[] = + STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 + STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 + STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 + STRING_word0 STRING_xdigit; + +static const uint8_t posix_name_lengths[] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; + +#define PC_GRAPH 8 +#define PC_PRINT 9 +#define PC_PUNCT 10 + +/* Table of class bit maps for each POSIX class. Each class is formed from a +base map, with an optional addition or removal of another map. Then, for some +classes, there is some additional tweaking: for [:blank:] the vertical space +characters are removed, and for [:alpha:] and [:alnum:] the underscore +character is removed. The triples in the table consist of the base map offset, +second map offset or -1 if no second map, and a non-negative value for map +addition or a negative value for map subtraction (if there are two maps). The +absolute value of the third field has these meanings: 0 => no tweaking, 1 => +remove vertical space characters, 2 => remove underscore. */ + +static const int posix_class_maps[] = { + cbit_word, cbit_digit, -2, /* alpha */ + cbit_lower, -1, 0, /* lower */ + cbit_upper, -1, 0, /* upper */ + cbit_word, -1, 2, /* alnum - word without underscore */ + cbit_print, cbit_cntrl, 0, /* ascii */ + cbit_space, -1, 1, /* blank - a GNU extension */ + cbit_cntrl, -1, 0, /* cntrl */ + cbit_digit, -1, 0, /* digit */ + cbit_graph, -1, 0, /* graph */ + cbit_print, -1, 0, /* print */ + cbit_punct, -1, 0, /* punct */ + cbit_space, -1, 0, /* space */ + cbit_word, -1, 0, /* word - a Perl extension */ + cbit_xdigit,-1, 0 /* xdigit */ +}; + +#ifdef SUPPORT_UNICODE + +/* The POSIX class Unicode property substitutes that are used in UCP mode must +be in the order of the POSIX class names, defined above. 
*/ + +static int posix_substitutes[] = { + PT_GC, ucp_L, /* alpha */ + PT_PC, ucp_Ll, /* lower */ + PT_PC, ucp_Lu, /* upper */ + PT_ALNUM, 0, /* alnum */ + -1, 0, /* ascii, treat as non-UCP */ + -1, 1, /* blank, treat as \h */ + PT_PC, ucp_Cc, /* cntrl */ + PT_PC, ucp_Nd, /* digit */ + PT_PXGRAPH, 0, /* graph */ + PT_PXPRINT, 0, /* print */ + PT_PXPUNCT, 0, /* punct */ + PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */ + PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */ + -1, 0 /* xdigit, treat as non-UCP */ +}; +#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t))) +#endif /* SUPPORT_UNICODE */ + +/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset +are allowed. */ + +#define PUBLIC_LITERAL_COMPILE_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \ + PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \ + PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF) + +#define PUBLIC_COMPILE_OPTIONS \ + (PUBLIC_LITERAL_COMPILE_OPTIONS| \ + PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ + PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ + PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ + PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ + PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ + PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) + +#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ + (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_CASELESS_RESTRICT) + +#define PUBLIC_COMPILE_EXTRA_OPTIONS \ + (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ + PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \ + PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \ + PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX) + +/* Compile time error code numbers. 
They are given names so that they can more +easily be tracked. When a new number is added, the tables called eint1 and +eint2 in pcre2posix.c may need to be updated, and a new error text must be +added to compile_error_texts in pcre2_error.c. Also, the error codes in +pcre2.h.in must be updated - their values are exactly 100 greater than these +values. */ + +enum { ERR0 = COMPILE_ERROR_BASE, + ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, + ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, + ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, + ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, + ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, + ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, + ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, + ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, + ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90, + ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 }; + +/* This is a table of start-of-pattern options such as (*UTF) and settings such +as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward +compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is +generic and always supported. 
*/ + +enum { PSO_OPT, /* Value is an option bit */ + PSO_FLG, /* Value is a flag bit */ + PSO_NL, /* Value is a newline type */ + PSO_BSR, /* Value is a \R type */ + PSO_LIMH, /* Read integer value for heap limit */ + PSO_LIMM, /* Read integer value for match limit */ + PSO_LIMD }; /* Read integer value for depth limit */ + +typedef struct pso { + const uint8_t *name; + uint16_t length; + uint16_t type; + uint32_t value; +} pso; + +/* NB: STRING_UTFn_RIGHTPAR contains the length as well */ + +static pso pso_list[] = { + { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, + { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, + { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, + { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, + { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, + { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, + { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, + { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, + { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, + { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, + { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, + { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, + { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, + { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, + { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, + { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, + { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, + { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, + { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, + { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, + { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } +}; + +/* 
This table is used when converting repeating opcodes into possessified +versions as a result of an explicit possessive quantifier such as ++. A zero +value means there is no possessified version - in those cases the item in +question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT +because all relevant opcodes are less than that. */ + +static const uint8_t opcode_possessify[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ + + 0, /* NOTI */ + OP_POSSTAR, 0, /* STAR, MINSTAR */ + OP_POSPLUS, 0, /* PLUS, MINPLUS */ + OP_POSQUERY, 0, /* QUERY, MINQUERY */ + OP_POSUPTO, 0, /* UPTO, MINUPTO */ + 0, /* EXACT */ + 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ + + OP_POSSTARI, 0, /* STARI, MINSTARI */ + OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ + OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ + OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ + 0, /* EXACTI */ + 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ + + OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ + OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ + OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ + OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ + 0, /* NOTEXACT */ + 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ + + OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ + OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ + OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ + OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ + 0, /* NOTEXACTI */ + 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ + + OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ + OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ + OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ + OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ + 0, /* TYPEEXACT */ + 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ + + OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ + OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ + OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ + OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ + 0, 0, 0, 0, /* 
CRPOS{STAR,PLUS,QUERY,RANGE} */ + + 0, 0, 0, /* CLASS, NCLASS, XCLASS */ + 0, 0, /* REF, REFI */ + 0, 0, /* DNREF, DNREFI */ + 0, 0 /* RECURSE, CALLOUT */ +}; + + +#ifdef DEBUG_SHOW_PARSED +/************************************************* +* Show the parsed pattern for debugging * +*************************************************/ + +/* For debugging the pre-scan, this code, which outputs the parsed data vector, +can be enabled. */ + +static void show_parsed(compile_block *cb) +{ +uint32_t *pptr = cb->parsed_pattern; + +for (;;) + { + int max, min; + PCRE2_SIZE offset; + uint32_t i; + uint32_t length; + uint32_t meta_arg = META_DATA(*pptr); + + fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr); + + if (*pptr < META_END) + { + if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr); + pptr++; + } + + else switch (META_CODE(*pptr++)) + { + default: + fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n"); + return; + + case META_END: + fprintf(stderr, "META_END\n"); + return; + + case META_CAPTURE: + fprintf(stderr, "META_CAPTURE %d", meta_arg); + break; + + case META_RECURSE: + GETOFFSET(offset, pptr); + fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); + break; + + case META_BACKREF: + if (meta_arg < 10) + offset = cb->small_ref_offset[meta_arg]; + else + GETOFFSET(offset, pptr); + fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); + break; + + case META_ESCAPE: + if (meta_arg == ESC_P || meta_arg == ESC_p) + { + uint32_t ptype = *pptr >> 16; + uint32_t pvalue = *pptr++ & 0xffff; + fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p', + ptype, pvalue); + } + else + { + uint32_t cc; + /* There's just one escape we might have here that isn't negated in the + escapes table. 
*/ + if (meta_arg == ESC_g) cc = CHAR_g; + else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++) + { + if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break; + } + if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK; + fprintf(stderr, "META \\%c", cc); + } + break; + + case META_MINMAX: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}", min, max); + else + fprintf(stderr, "META {%d,}", min); + break; + + case META_MINMAX_QUERY: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}?", min, max); + else + fprintf(stderr, "META {%d,}?", min); + break; + + case META_MINMAX_PLUS: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}+", min, max); + else + fprintf(stderr, "META {%d,}+", min); + break; + + case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break; + case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break; + case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break; + case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break; + case META_DOT: fprintf(stderr, "META_DOT"); break; + case META_ASTERISK: fprintf(stderr, "META *"); break; + case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break; + case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break; + case META_PLUS: fprintf(stderr, "META +"); break; + case META_PLUS_QUERY: fprintf(stderr, "META +?"); break; + case META_PLUS_PLUS: fprintf(stderr, "META ++"); break; + case META_QUERY: fprintf(stderr, "META ?"); break; + case META_QUERY_QUERY: fprintf(stderr, "META ??"); break; + case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break; + + case META_ATOMIC: fprintf(stderr, "META (?>"); break; + case META_NOCAPTURE: fprintf(stderr, "META (?:"); break; + case META_LOOKAHEAD: fprintf(stderr, "META (?="); break; + case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break; + case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break; 
+ case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break; + case META_KET: fprintf(stderr, "META )"); break; + case META_ALT: fprintf(stderr, "META | %d", meta_arg); break; + + case META_CLASS: fprintf(stderr, "META ["); break; + case META_CLASS_NOT: fprintf(stderr, "META [^"); break; + case META_CLASS_END: fprintf(stderr, "META ]"); break; + case META_CLASS_EMPTY: fprintf(stderr, "META []"); break; + case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break; + + case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; + case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; + + case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; + case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; + + case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; + case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; + case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break; + case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break; + case META_SKIP: fprintf(stderr, "META (*SKIP)"); break; + case META_THEN: fprintf(stderr, "META (*THEN)"); break; + + case META_OPTIONS: + fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]); + pptr += 2; + break; + + case META_LOOKBEHIND: + fprintf(stderr, "META (?<= %d offset=", meta_arg); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_LOOKBEHIND_NA: + fprintf(stderr, "META (*naplb: %d offset=", meta_arg); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_LOOKBEHINDNOT: + fprintf(stderr, "META (?="); + fprintf(stderr, "%d.", *pptr++); + fprintf(stderr, "%d)", *pptr++); + break; + + case META_COND_NAME: + fprintf(stderr, "META (?() length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_COND_RNAME: + fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + 
/* This is kept as a name, because it might be. */ + + case META_COND_RNUMBER: + fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_MARK: + fprintf(stderr, "META (*MARK:"); + goto SHOWARG; + + case META_COMMIT_ARG: + fprintf(stderr, "META (*COMMIT:"); + goto SHOWARG; + + case META_PRUNE_ARG: + fprintf(stderr, "META (*PRUNE:"); + goto SHOWARG; + + case META_SKIP_ARG: + fprintf(stderr, "META (*SKIP:"); + goto SHOWARG; + + case META_THEN_ARG: + fprintf(stderr, "META (*THEN:"); + SHOWARG: + length = *pptr++; + for (i = 0; i < length; i++) + { + uint32_t cc = *pptr++; + if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc); + else fprintf(stderr, "\\x{%x}", cc); + } + fprintf(stderr, ") length=%u", length); + break; + } + fprintf(stderr, "\n"); + } +return; +} +#endif /* DEBUG_SHOW_PARSED */ + + + +/************************************************* +* Copy compiled code * +*************************************************/ + +/* Compiled JIT code cannot be copied, so the new compiled block has no +associated JIT data. */ + +PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION +pcre2_code_copy(const pcre2_code *code) +{ +PCRE2_SIZE* ref_count; +pcre2_code *newcode; + +if (code == NULL) return NULL; +newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); +if (newcode == NULL) return NULL; +memcpy(newcode, code, code->blocksize); +newcode->executable_jit = NULL; + +/* If the code is one that has been deserialized, increment the reference count +in the decoded tables. 
*/ + +if ((code->flags & PCRE2_DEREF_TABLES) != 0) + { + ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); + (*ref_count)++; + } + +return newcode; +} + + + +/************************************************* +* Copy compiled code and character tables * +*************************************************/ + +/* Compiled JIT code cannot be copied, so the new compiled block has no +associated JIT data. This version of code_copy also makes a separate copy of +the character tables. */ + +PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION +pcre2_code_copy_with_tables(const pcre2_code *code) +{ +PCRE2_SIZE* ref_count; +pcre2_code *newcode; +uint8_t *newtables; + +if (code == NULL) return NULL; +newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); +if (newcode == NULL) return NULL; +memcpy(newcode, code, code->blocksize); +newcode->executable_jit = NULL; + +newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), + code->memctl.memory_data); +if (newtables == NULL) + { + code->memctl.free((void *)newcode, code->memctl.memory_data); + return NULL; + } +memcpy(newtables, code->tables, TABLES_LENGTH); +ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH); +*ref_count = 1; + +newcode->tables = newtables; +newcode->flags |= PCRE2_DEREF_TABLES; +return newcode; +} + + + +/************************************************* +* Free compiled code * +*************************************************/ + +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_code_free(pcre2_code *code) +{ +PCRE2_SIZE* ref_count; + +if (code != NULL) + { +#ifdef SUPPORT_JIT + if (code->executable_jit != NULL) + PRIV(jit_free)(code->executable_jit, &code->memctl); +#endif + + if ((code->flags & PCRE2_DEREF_TABLES) != 0) + { + /* Decoded tables belong to the codes after deserialization, and they must + be freed when there are no more references to them. The *ref_count should + always be > 0. 
*/ + + ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); + if (*ref_count > 0) + { + (*ref_count)--; + if (*ref_count == 0) + code->memctl.free((void *)code->tables, code->memctl.memory_data); + } + } + + code->memctl.free(code, code->memctl.memory_data); + } +} + + + +/************************************************* +* Read a number, possibly signed * +*************************************************/ + +/* This function is used to read numbers in the pattern. The initial pointer +must be the sign or first digit of the number. When relative values (introduced +by + or -) are allowed, they are relative group numbers, and the result must be +greater than zero. + +Arguments: + ptrptr points to the character pointer variable + ptrend points to the end of the input string + allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this + max_value the largest number allowed + max_error the error to give for an over-large number + intptr where to put the result + errorcodeptr where to put an error code + +Returns: TRUE - a number was read + FALSE - *errorcodeptr == 0 => no number was found + *errorcodeptr != 0 => an error occurred +*/ + +static BOOL +read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, + uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) +{ +int sign = 0; +uint32_t n = 0; +PCRE2_SPTR ptr = *ptrptr; +BOOL yield = FALSE; + +*errorcodeptr = 0; + +if (allow_sign >= 0 && ptr < ptrend) + { + if (*ptr == CHAR_PLUS) + { + sign = +1; + max_value -= allow_sign; + ptr++; + } + else if (*ptr == CHAR_MINUS) + { + sign = -1; + ptr++; + } + } + +if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE; +while (ptr < ptrend && IS_DIGIT(*ptr)) + { + n = n * 10 + *ptr++ - CHAR_0; + if (n > max_value) + { + *errorcodeptr = max_error; + goto EXIT; + } + } + +if (allow_sign >= 0 && sign != 0) + { + if (n == 0) + { + *errorcodeptr = ERR26; /* +0 and -0 are not allowed */ + goto EXIT; + } + + if (sign > 0) n += allow_sign; + else if 
((int)n > allow_sign) + { + *errorcodeptr = ERR15; /* Non-existent subpattern */ + goto EXIT; + } + else n = allow_sign + 1 - n; + } + +yield = TRUE; + +EXIT: +*intptr = n; +*ptrptr = ptr; +return yield; +} + + + +/************************************************* +* Read repeat counts * +*************************************************/ + +/* Read an item of the form {n,m} and return the values if non-NULL pointers +are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a +larger value is used for "unlimited". We have to use signed arguments for +read_number() because it is capable of returning a signed value. + +Arguments: + ptrptr points to pointer to character after '{' + ptrend pointer to end of input + minp if not NULL, pointer to int for min + maxp if not NULL, pointer to int for max + returned as REPEAT_UNLIMITED if no max + errorcodeptr points to error code variable + +Returns: FALSE if not a repeat quantifier, errorcode set zero + FALSE on error, with errorcode set non-zero + TRUE on success, with pointer updated to point after '}' +*/ + +static BOOL +read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, + uint32_t *maxp, int *errorcodeptr) +{ +PCRE2_SPTR p; +BOOL yield = FALSE; +BOOL had_comma = FALSE; +int32_t min = 0; +int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */ + +/* Check the syntax */ + +*errorcodeptr = 0; +for (p = *ptrptr;; p++) + { + uint32_t c; + if (p >= ptrend) return FALSE; + c = *p; + if (IS_DIGIT(c)) continue; + if (c == CHAR_RIGHT_CURLY_BRACKET) break; + if (c == CHAR_COMMA) + { + if (had_comma) return FALSE; + had_comma = TRUE; + } + else return FALSE; + } + +/* The only error from read_number() is for a number that is too big.
*/ + +p = *ptrptr; +if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr)) + goto EXIT; + +if (*p == CHAR_RIGHT_CURLY_BRACKET) + { + p++; + max = min; + } +else + { + if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) + { + if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, + errorcodeptr)) + goto EXIT; + if (max < min) + { + *errorcodeptr = ERR4; + goto EXIT; + } + } + p++; + } + +yield = TRUE; +if (minp != NULL) *minp = (uint32_t)min; +if (maxp != NULL) *maxp = (uint32_t)max; + +/* Update the pattern pointer */ + +EXIT: +*ptrptr = p; +return yield; +} + + + +/************************************************* +* Handle escapes * +*************************************************/ + +/* This function is called when a \ has been encountered. It either returns a +positive value for a simple escape such as \d, or 0 for a data character, which +is placed in chptr. A backreference to group n is returned as negative n. On +entry, ptr is pointing at the character after \. On exit, it points after the +final code unit of the escape sequence. + +This function is also called from pcre2_substitute() to handle escape sequences +in replacement strings. In this case, the cb argument is NULL, and in the case +of escapes that have further processing, only sequences that define a data +character are recognised. The isclass argument is not relevant; the options +argument is the final value of the compiled pattern's options. 
+ +Arguments: + ptrptr points to the input position pointer + ptrend points to the end of the input + chptr points to a returned data character + errorcodeptr points to the errorcode variable (containing zero) + options the current options bits + xoptions the current extra options bits + isclass TRUE if inside a character class + cb compile data block or NULL when called from pcre2_substitute() + +Returns: zero => a data character + positive => a special escape sequence + negative => a numerical back reference + on error, errorcodeptr is set non-zero +*/ + +int +PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, + int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass, + compile_block *cb) +{ +BOOL utf = (options & PCRE2_UTF) != 0; +PCRE2_SPTR ptr = *ptrptr; +uint32_t c, cc; +int escape = 0; +int i; + +/* If backslash is at the end of the string, it's an error. */ + +if (ptr >= ptrend) + { + *errorcodeptr = ERR1; + return 0; + } + +GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ +*errorcodeptr = 0; /* Be optimistic */ + +/* Non-alphanumerics are literals, so we just leave the value in c. An initial +value test saves a memory lookup for code points outside the alphanumeric +range. */ + +if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ + +/* Otherwise, do a table lookup. Non-zero values need little processing here. A +positive value is a literal value for something like \n. A negative value is +the negation of one of the ESC_ macros that is passed back for handling by the +calling function. Some extra checking is needed for \N because only \N{U+dddd} +is supported. If the value is zero, further processing is handled below. 
*/ + +else if ((i = escapes[c - ESCAPES_FIRST]) != 0) + { + if (i > 0) + { + c = (uint32_t)i; + if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0) + c = CHAR_LF; + } + else /* Negative table entry */ + { + escape = -i; /* Else return a special escape */ + if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X)) + cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ + + /* Perl supports \N{name} for character names and \N{U+dddd} for numerical + Unicode code points, as well as plain \N for "not newline". PCRE does not + support \N{name}. However, it does support quantification such as \N{2,3}, + so if \N{ is not followed by U+dddd we check for a quantifier. */ + + if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) + { + PCRE2_SPTR p = ptr + 1; + + /* \N{U+ can be handled by the \x{ code. However, this construction is + not valid in EBCDIC environments because it specifies a Unicode + character, not a codepoint in the local code. For example \N{U+0041} + must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode + casing semantics for the entire pattern, so allow it only in UTF (i.e. + Unicode) mode. */ + + if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS) + { +#ifdef EBCDIC + *errorcodeptr = ERR93; +#else + if (utf) + { + ptr = p + 1; + escape = 0; /* Not a fancy escape after all */ + goto COME_FROM_NU; + } + else *errorcodeptr = ERR93; +#endif + } + + /* Give an error if what follows is not a quantifier, but don't override + an error set by the quantifier reader (e.g. number overflow). */ + + else + { + if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) && + *errorcodeptr == 0) + *errorcodeptr = ERR37; + } + } + } + } + +/* Escapes that need further processing, including those that are unknown, have +a zero entry in the lookup table. When called from pcre2_substitute(), only \c, +\o, and \x are recognized (\u and \U can never appear as they are used for case +forcing). 
*/ + +else + { + int s; + PCRE2_SPTR oldptr; + BOOL overflow; + BOOL alt_bsux = + ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0; + + /* Filter calls from pcre2_substitute(). */ + + if (cb == NULL) + { + if (c != CHAR_c && c != CHAR_o && c != CHAR_x) + { + *errorcodeptr = ERR3; + return 0; + } + alt_bsux = FALSE; /* Do not modify \x handling */ + } + + switch (c) + { + /* A number of Perl escapes are not handled by PCRE. We give an explicit + error. */ + + case CHAR_F: + case CHAR_l: + case CHAR_L: + *errorcodeptr = ERR37; + break; + + /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX + is set. Otherwise, \u must be followed by exactly four hex digits or, if + PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces. + Otherwise it is a lowercase u letter. This gives some compatibility with + ECMAScript (aka JavaScript). */ + + case CHAR_u: + if (!alt_bsux) *errorcodeptr = ERR37; else + { + uint32_t xc; + + if (ptr >= ptrend) break; + if (*ptr == CHAR_LEFT_CURLY_BRACKET && + (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0) + { + PCRE2_SPTR hptr = ptr + 1; + cc = 0; + + while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) + { + if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ + { + *errorcodeptr = ERR77; + ptr = hptr; /* Show where */ + break; /* *hptr != } will cause another break below */ + } + cc = (cc << 4) | xc; + hptr++; + } + + if (hptr == ptr + 1 || /* No hex digits */ + hptr >= ptrend || /* Hit end of input */ + *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ + break; /* Hex escape not recognized */ + + c = cc; /* Accept the code point */ + ptr = hptr + 1; + } + + else /* Must be exactly 4 hex digits */ + { + if (ptrend - ptr < 4) break; /* Less than 4 chars */ + if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ + if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ + cc = (cc << 4) | xc; + if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ + cc = (cc 
<< 4) | xc; + if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ + c = (cc << 4) | xc; + ptr += 4; + } + + if (utf) + { + if (c > 0x10ffffU) *errorcodeptr = ERR77; + else + if (c >= 0xd800 && c <= 0xdfff && + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + *errorcodeptr = ERR73; + } + else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; + } + break; + + /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, + in which case it is an upper case letter. */ + + case CHAR_U: + if (!alt_bsux) *errorcodeptr = ERR37; + break; + + /* In a character class, \g is just a literal "g". Outside a character + class, \g must be followed by one of a number of specific things: + + (1) A number, either plain or braced. If positive, it is an absolute + backreference. If negative, it is a relative backreference. This is a Perl + 5.10 feature. + + (2) Perl 5.10 also supports \g{name} as a reference to a named group. This + is part of Perl's movement towards a unified syntax for back references. As + this is synonymous with \k{name}, we fudge it up by pretending it really + was \k{name}. + + (3) For Oniguruma compatibility we also support \g followed by a name or a + number either in angle brackets or in single quotes. However, these are + (possibly recursive) subroutine calls, _not_ backreferences. We return + the ESC_g code. + + Summary: Return a negative number for a numerical back reference, ESC_k for + a named back reference, and ESC_g for a named or numbered subroutine call. + */ + + case CHAR_g: + if (isclass) break; + + if (ptr >= ptrend) + { + *errorcodeptr = ERR57; + break; + } + + if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE) + { + escape = ESC_g; + break; + } + + /* If there is a brace delimiter, try to read a numerical reference. If + there isn't one, assume we have a name and treat it as \k. 
*/ + + if (*ptr == CHAR_LEFT_CURLY_BRACKET) + { + PCRE2_SPTR p = ptr + 1; + if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, + errorcodeptr)) + { + if (*errorcodeptr == 0) escape = ESC_k; /* No number found */ + break; + } + if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR57; + break; + } + ptr = p + 1; + } + + /* Read an undelimited number */ + + else + { + if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, + errorcodeptr)) + { + if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */ + break; + } + } + + if (s <= 0) + { + *errorcodeptr = ERR15; + break; + } + + escape = -s; + break; + + /* The handling of escape sequences consisting of a string of digits + starting with one that is not zero is not straightforward. Perl has changed + over the years. Nowadays \g{} for backreferences and \o{} for octal are + recommended to avoid the ambiguities in the old syntax. + + Outside a character class, the digits are read as a decimal number. If the + number is less than 10, or if there are that many previous extracting left + brackets, it is a back reference. Otherwise, up to three octal digits are + read to form an escaped character code. Thus \123 is likely to be octal 123 + (cf \0123, which is octal 012 followed by the literal 3). + + Inside a character class, \ followed by a digit is always either a literal + 8 or 9 or an octal number. */ + + case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: + case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: + + if (!isclass) + { + oldptr = ptr; + ptr--; /* Back to the digit */ + + /* As we know we are at a digit, the only possible error from + read_number() is a number that is too large to be a group number. In this + case we fall through handle this as not a group reference. If we have + read a small enough number, check for a back reference. + + \1 to \9 are always back references. 
\8x and \9x are too; \1x to \7x + are octal escapes if there are not that many previous captures. */ + + if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) && + (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)) + { + if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61; + else escape = -s; /* Indicates a back reference */ + break; + } + + ptr = oldptr; /* Put the pointer back and fall through */ + } + + /* Handle a digit following \ when the number is not a back reference, or + we are within a character class. If the first digit is 8 or 9, Perl used to + generate a binary zero and then treat the digit as a following literal. At + least by Perl 5.18 this changed so as not to insert the binary zero. */ + + if (c >= CHAR_8) break; + + /* Fall through */ + + /* \0 always starts an octal number, but we may drop through to here with a + larger first octal digit. The original code used just to take the least + significant 8 bits of octal numbers (I think this is what early Perls used + to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, + but no more than 3 octal digits. */ + + case CHAR_0: + c -= CHAR_0; + while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) + c = c * 8 + *ptr++ - CHAR_0; +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (!utf && c > 0xff) *errorcodeptr = ERR51; +#endif + break; + + /* \o is a relatively new Perl feature, supporting a more general way of + specifying character codes in octal. The only supported form is \o{ddd}. 
*/ + + case CHAR_o: + if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET) + { + ptr--; + *errorcodeptr = ERR55; + } + else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) + *errorcodeptr = ERR78; + else + { + c = 0; + overflow = FALSE; + while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) + { + cc = *ptr++; + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x20000000l) { overflow = TRUE; break; } +#endif + c = (c << 3) + (cc - CHAR_0); +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } +#elif PCRE2_CODE_UNIT_WIDTH == 16 + if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } +#elif PCRE2_CODE_UNIT_WIDTH == 32 + if (utf && c > 0x10ffffU) { overflow = TRUE; break; } +#endif + } + if (overflow) + { + while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; + *errorcodeptr = ERR34; + } + else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff && + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + { + ptr--; + *errorcodeptr = ERR73; + } + } + else + { + ptr--; + *errorcodeptr = ERR64; + } + } + break; + + /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed + by two hexadecimal digits. Otherwise it is a lowercase x letter. */ + + case CHAR_x: + if (alt_bsux) + { + uint32_t xc; + if (ptrend - ptr < 2) break; /* Less than 2 characters */ + if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ + if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ + c = (cc << 4) | xc; + ptr += 2; + } + + /* Handle \x in Perl's style. \x{ddd} is a character code which can be + greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex + digits. If not, { used to be treated as a data character. However, Perl + seems to read hex digits up to the first non-such, and ignore the rest, so + that, for example \x{zz} matches a binary zero. 
This seems crazy, so PCRE + now gives an error. */ + + else + { + if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) + { +#ifndef EBCDIC + COME_FROM_NU: +#endif + if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR78; + break; + } + c = 0; + overflow = FALSE; + + while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff) + { + ptr++; + if (c == 0 && cc == 0) continue; /* Leading zeroes */ +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x10000000l) { overflow = TRUE; break; } +#endif + c = (c << 4) | cc; + if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) + { + overflow = TRUE; + break; + } + } + + if (overflow) + { + while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++; + *errorcodeptr = ERR34; + } + else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff && + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + { + ptr--; + *errorcodeptr = ERR73; + } + } + + /* If the sequence of hex digits does not end with '}', give an error. + We used just to recognize this construct and fall through to the normal + \x handling, but nowadays Perl gives an error, which seems much more + sensible, so we do too. */ + + else + { + ptr--; + *errorcodeptr = ERR67; + } + } /* End of \x{} processing */ + + /* Read up to two hex digits after \x */ + + else + { + c = 0; + if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ + ptr++; + c = cc; + if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ + ptr++; + c = (c << 4) | cc; + } /* End of \xdd handling */ + } /* End of Perl-style \x handling */ + break; + + /* The handling of \c is different in ASCII and EBCDIC environments. In an + ASCII (or Unicode) environment, an error is given if the character + following \c is not a printable ASCII character. Otherwise, the following + character is upper-cased if it is a letter, and after that the 0x40 bit is + flipped. The result is the value of the escape.
+ + In an EBCDIC environment the handling of \c is compatible with the + specification in the perlebcdic document. The following character must be + a letter or one of a small number of special characters. These provide a + means of defining the character values 0-31. + + For testing the EBCDIC handling of \c in an ASCII environment, recognize + the EBCDIC value of 'c' explicitly. */ + +#if defined EBCDIC && 'a' != 0x81 + case 0x83: +#else + case CHAR_c: +#endif + if (ptr >= ptrend) + { + *errorcodeptr = ERR2; + break; + } + c = *ptr; + if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); + + /* Handle \c in an ASCII/Unicode environment. */ + +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ + { + *errorcodeptr = ERR68; + break; + } + c ^= 0x40; + + /* Handle \c in an EBCDIC environment. The special case \c? is converted to + 255 (0xff) or 95 (0x5f) if other characters suggest we are using the + POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.) + The other valid sequences correspond to a list of specific characters. */ + +#else + if (c == CHAR_QUESTION_MARK) + c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; + else + { + for (i = 0; i < 32; i++) + { + if (c == ebcdic_escape_c[i]) break; + } + if (i < 32) c = i; else *errorcodeptr = ERR68; + } +#endif /* EBCDIC */ + + ptr++; + break; + + /* Any other alphanumeric following \ is an error. Perl gives an error only + if in warning mode, but PCRE doesn't have a warning mode. */ + + default: + *errorcodeptr = ERR3; + *ptrptr = ptr - 1; /* Point to the character at fault */ + return 0; + } + } + +/* Set the pointer to the next character before returning.
*/ + +*ptrptr = ptr; +*chptr = c; +return escape; +} + + + +#ifdef SUPPORT_UNICODE +/************************************************* +* Handle \P and \p * +*************************************************/ + +/* This function is called after \P or \p has been encountered, provided that +PCRE2 is compiled with support for UTF and Unicode properties. On entry, the +contents of ptrptr are pointing after the P or p. On exit, it is left pointing +after the final code unit of the escape sequence. + +Arguments: + ptrptr the pattern position pointer + negptr a boolean that is set TRUE for negation else FALSE + ptypeptr an unsigned int that is set to the type value + pdataptr an unsigned int that is set to the detailed property value + errorcodeptr the error code variable + cb the compile data + +Returns: TRUE if the type value was found, or FALSE for an invalid type +*/ + +static BOOL +get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, + uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) +{ +PCRE2_UCHAR c; +PCRE2_SIZE i, bot, top; +PCRE2_SPTR ptr = *ptrptr; +PCRE2_UCHAR name[50]; +PCRE2_UCHAR *vptr = NULL; +uint16_t ptscript = PT_NOTSCRIPT; + +if (ptr >= cb->end_pattern) goto ERROR_RETURN; +c = *ptr++; +*negptr = FALSE; + +/* \P or \p can be followed by a name in {}, optionally preceded by ^ for +negation. 
*/ + +if (c == CHAR_LEFT_CURLY_BRACKET) + { + if (ptr >= cb->end_pattern) goto ERROR_RETURN; + + if (*ptr == CHAR_CIRCUMFLEX_ACCENT) + { + *negptr = TRUE; + ptr++; + } + + for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) + { + if (ptr >= cb->end_pattern) goto ERROR_RETURN; + c = *ptr++; + while (c == '_' || c == '-' || isspace(c)) + { + if (ptr >= cb->end_pattern) goto ERROR_RETURN; + c = *ptr++; + } + if (c == CHAR_NUL) goto ERROR_RETURN; + if (c == CHAR_RIGHT_CURLY_BRACKET) break; + name[i] = tolower(c); + if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i; + } + + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; + name[i] = 0; + } + +/* If { doesn't follow \p or \P there is just one following character, which +must be an ASCII letter. */ + +else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) + { + name[0] = tolower(c); + name[1] = 0; + } +else goto ERROR_RETURN; + +*ptrptr = ptr; + +/* If the property contains ':' or '=' we have class name and value separately +specified. The following are supported: + + . Bidi_Class (synonym bc), for which the property names are "bidi". + . Script (synonym sc) for which the property name is the script name + . Script_Extensions (synonym scx), ditto + +As this is a small number, we currently just check the names directly. If this +grows, a sorted table and a switch will be neater. + +For both the script properties, set a PT_xxx value so that (1) they can be +distinguished and (2) invalid script names that happen to be the name of +another property can be diagnosed. 
*/ + +if (vptr != NULL) + { + int offset = 0; + PCRE2_UCHAR sname[8]; + + *vptr = 0; /* Terminate property name */ + if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 || + PRIV(strcmp_c8)(name, STRING_bc) == 0) + { + offset = 4; + sname[0] = CHAR_b; + sname[1] = CHAR_i; /* There is no strcpy_c8 function */ + sname[2] = CHAR_d; + sname[3] = CHAR_i; + } + + else if (PRIV(strcmp_c8)(name, STRING_script) == 0 || + PRIV(strcmp_c8)(name, STRING_sc) == 0) + ptscript = PT_SC; + + else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 || + PRIV(strcmp_c8)(name, STRING_scx) == 0) + ptscript = PT_SCX; + + else + { + *errorcodeptr = ERR47; + return FALSE; + } + + /* Adjust the string in name[] as needed */ + + memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR)); + if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR)); + } + +/* Search for a recognized property using binary chop. */ + +bot = 0; +top = PRIV(utt_size); + +while (bot < top) + { + int r; + i = (bot + top) >> 1; + r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); + + /* When a matching property is found, some extra checking is needed when the + \p{xx:yy} syntax is used and xx is either sc or scx. */ + + if (r == 0) + { + *pdataptr = PRIV(utt)[i].value; + if (vptr == NULL || ptscript == PT_NOTSCRIPT) + { + *ptypeptr = PRIV(utt)[i].type; + return TRUE; + } + + switch (PRIV(utt)[i].type) + { + case PT_SC: + *ptypeptr = PT_SC; + return TRUE; + + case PT_SCX: + *ptypeptr = ptscript; + return TRUE; + } + + break; /* Non-script found */ + } + + if (r > 0) bot = i + 1; else top = i; + } + +*errorcodeptr = ERR47; /* Unrecognized property */ +return FALSE; + +ERROR_RETURN: /* Malformed \P or \p */ +*errorcodeptr = ERR46; +*ptrptr = ptr; +return FALSE; +} +#endif + + + +/************************************************* +* Check for POSIX class syntax * +*************************************************/ + +/* This function is called when the sequence "[:" or "[." 
or "[=" is +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. + +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special cases \\ and \], but does not try to do any other +escape processing. This makes it different from Perl for cases such as +[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does +not recognize "l\ower". This is a lesser evil than not diagnosing bad classes +when Perl does, I think. + +A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. +It seems that the appearance of a nested POSIX class supersedes an apparent +external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or +a digit. This is handled by returning FALSE if the start of a new group with +the same terminator is encountered, since the next closing sequence must close +the nested group, not the outer one. + +In Perl, unescaped square brackets may also appear as part of class names. For +example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for +[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not +seem right at all. PCRE does not allow closing square brackets in POSIX class +names. 
+ +Arguments: + ptr pointer to the character after the initial [ (colon, dot, equals) + ptrend pointer to the end of the pattern + endptr where to return a pointer to the terminating ':', '.', or '=' + +Returns: TRUE or FALSE +*/ + +static BOOL +check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) +{ +PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ +terminator = *ptr++; /* compiler warns about "non-constant" initializer. */ + +for (; ptrend - ptr >= 2; ptr++) + { + if (*ptr == CHAR_BACKSLASH && + (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) + ptr++; + + else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || + *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; + + else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + { + *endptr = ptr; + return TRUE; + } + } + +return FALSE; +} + + + +/************************************************* +* Check POSIX class name * +*************************************************/ + +/* This function is called to check the name given in a POSIX-style class entry +such as [:alnum:]. + +Arguments: + ptr points to the first letter + len the length of the name + +Returns: a value representing the name, or -1 if unknown +*/ + +static int +check_posix_name(PCRE2_SPTR ptr, int len) +{ +const char *pn = posix_names; +int yield = 0; +while (posix_name_lengths[yield] != 0) + { + if (len == posix_name_lengths[yield] && + PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; + pn += posix_name_lengths[yield] + 1; + yield++; + } +return -1; +} + + + +/************************************************* +* Read a subpattern or VERB name * +*************************************************/ + +/* This function is called from parse_regex() below whenever it needs to read +the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial +pointer must be to the character before the name. 
If that character is '*' we +are reading a verb or alpha assertion name. The pointer is updated to point +after the name, for a VERB or alpha assertion name, or after the name's +terminator for a subpattern name. Returning both the offset and the name +pointer is redundant information, but some callers use one and some the other, +so it is simplest just to return both. + +Arguments: + ptrptr points to the character pointer variable + ptrend points to the end of the input string + utf true if the input is UTF-encoded + terminator the terminator of a subpattern name must be this + offsetptr where to put the offset from the start of the pattern + nameptr where to put a pointer to the name in the input + namelenptr where to put the length of the name + errorcodeptr where to put an error code + cb pointer to the compile data block + +Returns: TRUE if a name was read + FALSE otherwise, with error code set +*/ + +static BOOL +read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator, + PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, + int *errorcodeptr, compile_block *cb) +{ +PCRE2_SPTR ptr = *ptrptr; +BOOL is_group = (*ptr != CHAR_ASTERISK); + +if (++ptr >= ptrend) /* No characters in name */ + { + *errorcodeptr = is_group? ERR62: /* Subpattern name expected */ + ERR60; /* Verb not recognized or malformed */ + goto FAILED; + } + +*nameptr = ptr; +*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); + +/* In UTF mode, a group name may contain letters and decimal digits as defined +by Unicode properties, and underscores, but must not start with a digit.
*/ + +#ifdef SUPPORT_UNICODE +if (utf && is_group) + { + uint32_t c, type; + + GETCHAR(c, ptr); + type = UCD_CHARTYPE(c); + + if (type == ucp_Nd) + { + *errorcodeptr = ERR44; + goto FAILED; + } + + for(;;) + { + if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L && + c != CHAR_UNDERSCORE) break; + ptr++; + FORWARDCHARTEST(ptr, ptrend); + if (ptr >= ptrend) break; + GETCHAR(c, ptr); + type = UCD_CHARTYPE(c); + } + } +else +#else +(void)utf; /* Avoid compiler warning */ +#endif /* SUPPORT_UNICODE */ + +/* Handle non-group names and group names in non-UTF modes. A group name must +not start with a digit. If either of the others start with a digit it just +won't be recognized. */ + + { + if (is_group && IS_DIGIT(*ptr)) + { + *errorcodeptr = ERR44; + goto FAILED; + } + + while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) + { + ptr++; + } + } + +/* Check name length */ + +if (ptr > *nameptr + MAX_NAME_SIZE) + { + *errorcodeptr = ERR48; + goto FAILED; + } +*namelenptr = (uint32_t)(ptr - *nameptr); + +/* Subpattern names must not be empty, and their terminator is checked here. +(What follows a verb or alpha assertion name is checked separately.) */ + +if (is_group) + { + if (ptr == *nameptr) + { + *errorcodeptr = ERR62; /* Subpattern name expected */ + goto FAILED; + } + if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator) + { + *errorcodeptr = ERR42; + goto FAILED; + } + ptr++; + } + +*ptrptr = ptr; +return TRUE; + +FAILED: +*ptrptr = ptr; +return FALSE; +} + + + +/************************************************* +* Manage callouts at start of cycle * +*************************************************/ + +/* At the start of a new item in parse_regex() we are able to record the +details of the previous item in a prior callout, and also to set up an +automatic callout if enabled. Avoid having two adjacent automatic callouts, +which would otherwise happen for items such as \Q that contribute nothing to +the parsed pattern. 
+ +Arguments: + ptr current pattern pointer + pcalloutptr points to a pointer to previous callout, or NULL + auto_callout TRUE if auto_callouts are enabled + parsed_pattern the parsed pattern pointer + cb compile block + +Returns: possibly updated parsed_pattern pointer. +*/ + +static uint32_t * +manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, + uint32_t *parsed_pattern, compile_block *cb) +{ +uint32_t *previous_callout = *pcalloutptr; + +if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr - + cb->start_pattern - (PCRE2_SIZE)previous_callout[1]); /* Current offset minus the offset stored in element 1 */ + +if (!auto_callout) previous_callout = NULL; else + { + if (previous_callout == NULL || + previous_callout != parsed_pattern - 4 || /* Not the four elements just written */ + previous_callout[3] != 255) /* Element 3 is set to 255 for callouts created here */ + { + previous_callout = parsed_pattern; /* Set up new automatic callout */ + parsed_pattern += 4; + previous_callout[0] = META_CALLOUT_NUMBER; + previous_callout[2] = 0; + previous_callout[3] = 255; + } + previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); /* Offset of the current item in the pattern */ + } + +*pcalloutptr = previous_callout; +return parsed_pattern; +} + + + +/************************************************* +* Handle \d, \D, \s, \S, \w, \W * +*************************************************/ + +/* This function is called from parse_regex() below, both for freestanding +escapes, and those within classes, to handle those escapes that may change when +Unicode property support is requested. Note that PCRE2_UCP will never be set +without Unicode support because that is checked when pcre2_compile() is called. + +Arguments: + escape the ESC_...
value + parsed_pattern where to add the code + options options bits + xoptions extra options bits + +Returns: updated value of parsed_pattern +*/ +static uint32_t * +handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options, + uint32_t xoptions) +{ +uint32_t ascii_option = 0; /* Extra option bit that keeps this escape unconverted */ +uint32_t prop = ESC_p; /* Changed to ESC_P for the upper case escapes */ + +switch(escape) + { + case ESC_D: + prop = ESC_P; + /* Fall through */ + case ESC_d: + ascii_option = PCRE2_EXTRA_ASCII_BSD; + break; + + case ESC_S: + prop = ESC_P; + /* Fall through */ + case ESC_s: + ascii_option = PCRE2_EXTRA_ASCII_BSS; + break; + + case ESC_W: + prop = ESC_P; + /* Fall through */ + case ESC_w: + ascii_option = PCRE2_EXTRA_ASCII_BSW; + break; + } + +if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0) + { + *parsed_pattern++ = META_ESCAPE + escape; /* Keep the original escape: one item */ + } +else + { + *parsed_pattern++ = META_ESCAPE + prop; /* Property escape, followed by one item with the type in the upper 16 bits */ + switch(escape) + { + case ESC_d: + case ESC_D: + *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; + break; + + case ESC_s: + case ESC_S: + *parsed_pattern++ = PT_SPACE << 16; + break; + + case ESC_w: + case ESC_W: + *parsed_pattern++ = PT_WORD << 16; + break; + } + } + +return parsed_pattern; +} + + + +/************************************************* +* Parse regex and identify named groups * +*************************************************/ + +/* This function is called first of all. It scans the pattern and does two +things: (1) It identifies capturing groups and makes a table of named capturing +groups so that information about them is fully available to both the compiling +scans. (2) It writes a parsed version of the pattern with comments omitted and +escapes processed into the parsed_pattern vector.
+ +Arguments: + ptr points to the start of the pattern + options compiling dynamic options (may change during the scan) + has_lookbehind points to a boolean, set TRUE if a lookbehind is found + cb pointer to the compile data block + +Returns: zero on success or a non-zero error code, with the + error offset placed in the cb field +*/ + +/* A structure and some flags for dealing with nested groups. */ + +typedef struct nest_save { + uint16_t nest_depth; + uint16_t reset_group; + uint16_t max_group; + uint16_t flags; + uint32_t options; + uint32_t xoptions; +} nest_save; + +#define NSF_RESET 0x0001u +#define NSF_CONDASSERT 0x0002u +#define NSF_ATOMICSR 0x0004u + +/* Options that are changeable within the pattern must be tracked during +parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, +but all must be tracked so that META_OPTIONS items set the correct values for +the main compiling phase. */ + +#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \ + PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ + PCRE2_UNGREEDY) + +#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \ + PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \ + PCRE2_EXTRA_ASCII_POSIX) + +/* States used for analyzing ranges in character classes. The two OK values +must be last, because the class parser recognizes a hyphen as a possible start of range by testing whether the state is >= RANGE_OK_ESCAPED. */ + +enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; + +/* Only in 32-bit mode can there be literals >= META_END. A macro encapsulates +the storing of literal values in the main parsed pattern, where they can always +be quantified. The macro also sets the caller's okquantifier variable. Note that the form used when the code unit width is not 32 expands to two separate statements, so it must not be used as the unbraced body of an if or a loop. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define PARSED_LITERAL(c, p) \ + { \ + if (c >= META_END) *p++ = META_BIGVALUE; \ + *p++ = c; \ + okquantifier = TRUE; \ + } +#else +#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE; +#endif + +/* Here's the actual function.
*/ + +static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, + compile_block *cb) +{ +uint32_t c; +uint32_t delimiter; +uint32_t namelen; +uint32_t class_range_state; +uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ +uint32_t *verbstartptr = NULL; +uint32_t *previous_callout = NULL; +uint32_t *parsed_pattern = cb->parsed_pattern; +uint32_t *parsed_pattern_end = cb->parsed_pattern_end; +uint32_t meta_quantifier = 0; +uint32_t add_after_mark = 0; +uint32_t xoptions = cb->cx->extra_options; +uint16_t nest_depth = 0; +int after_manual_callout = 0; +int expect_cond_assert = 0; +int errorcode = 0; +int escape; +int i; +BOOL inescq = FALSE; +BOOL inverbname = FALSE; +BOOL utf = (options & PCRE2_UTF) != 0; +BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; +BOOL isdupname; +BOOL negate_class; +BOOL okquantifier = FALSE; +PCRE2_SPTR thisptr; +PCRE2_SPTR name; +PCRE2_SPTR ptrend = cb->end_pattern; +PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ +named_group *ng; +nest_save *top_nest, *end_nests; + +/* Insert leading items for word and line matching (features provided for the +benefit of pcre2grep). */ + +if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0) + { + *parsed_pattern++ = META_CIRCUMFLEX; + *parsed_pattern++ = META_NOCAPTURE; + } +else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0) + { + *parsed_pattern++ = META_ESCAPE + ESC_b; + *parsed_pattern++ = META_NOCAPTURE; + } + +/* If the pattern is actually a literal string, process it separately to avoid +cluttering up the main loop. 
*/ + +if ((options & PCRE2_LITERAL) != 0) + { + while (ptr < ptrend) + { + if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } + thisptr = ptr; + GETCHARINCTEST(c, ptr); + if (auto_callout) + parsed_pattern = manage_callouts(thisptr, &previous_callout, + auto_callout, parsed_pattern, cb); + PARSED_LITERAL(c, parsed_pattern); + } + goto PARSED_END; + } + +/* Process a real regex which may contain meta-characters. */ + +top_nest = NULL; +end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); + +/* The size of the nest_save structure might not be a factor of the size of the +workspace. Therefore we must round down end_nests so as to correctly avoid +creating a nest_save that spans the end of the workspace. */ + +end_nests = (nest_save *)((char *)end_nests - + ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save))); + +/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */ + +if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED; + +/* Now scan the pattern */ + +while (ptr < ptrend) + { + int prev_expect_cond_assert; + uint32_t min_repeat = 0, max_repeat = 0; + uint32_t set, unset, *optset; + uint32_t xset, xunset, *xoptset; + uint32_t terminator; + uint32_t prev_meta_quantifier; + BOOL prev_okquantifier; + PCRE2_SPTR tempptr; + PCRE2_SIZE offset; + + if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } + + if (nest_depth > cb->cx->parens_nest_limit) + { + errorcode = ERR19; + goto FAILED; /* Parentheses too deeply nested */ + } + + /* Get next input character, save its position for callout handling. */ + + thisptr = ptr; + GETCHARINCTEST(c, ptr); + + /* Copy quoted literals until \E, allowing for the possibility of automatic + callouts, except when processing a (*VERB) "name". 
*/ + + if (inescq) + { + if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) + { + inescq = FALSE; + ptr++; /* Skip E */ + } + else + { + if (expect_cond_assert > 0) /* A literal is not allowed if we are */ + { /* expecting a conditional assertion, */ + ptr--; /* but an empty \Q\E sequence is OK. */ + errorcode = ERR28; + goto FAILED; + } + if (inverbname) + { /* Don't use PARSED_LITERAL() because it */ +#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ + if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; +#endif + *parsed_pattern++ = c; + } + else + { + if (after_manual_callout-- <= 0) + parsed_pattern = manage_callouts(thisptr, &previous_callout, + auto_callout, parsed_pattern, cb); + PARSED_LITERAL(c, parsed_pattern); + } + meta_quantifier = 0; + } + continue; /* Next character */ + } + + /* If we are processing the "name" part of a (*VERB:NAME) item, all + characters up to the closing parenthesis are literals except when + PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q + and \E and escaped characters are allowed (no character types such as \d). If + PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do + this by not entering the special (*VERB:NAME) processing - they are then + picked up below. Note that c is a character, not a code unit, so we must not + use MAX_255 to test its size because MAX_255 tests code units and is assumed + TRUE in 8-bit mode. 
*/ + + if (inverbname && + ( + /* EITHER: not both options set */ + ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != + (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || +#ifdef SUPPORT_UNICODE + /* OR: character > 255 AND not Unicode Pattern White Space */ + (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) || +#endif + /* OR: not a # comment or isspace() white space */ + (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0 +#ifdef SUPPORT_UNICODE + /* and not CHAR_NEL when Unicode is supported */ + && c != CHAR_NEL +#endif + ))) + { + PCRE2_SIZE verbnamelength; + + switch(c) + { + default: /* Don't use PARSED_LITERAL() because it */ +#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */ + if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; +#endif + *parsed_pattern++ = c; + break; + + case CHAR_RIGHT_PARENTHESIS: + inverbname = FALSE; + /* This is the length in characters */ + verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); + /* But the limit on the length is in code units */ + if (ptr - verbnamestart - 1 > (int)MAX_MARK) + { + ptr--; + errorcode = ERR76; + goto FAILED; + } + *verblengthptr = (uint32_t)verbnamelength; + + /* If this name was on a verb such as (*ACCEPT) which does not continue, + a (*MARK) was generated for the name. We now add the original verb as the + next item. */ + + if (add_after_mark != 0) + { + *parsed_pattern++ = add_after_mark; + add_after_mark = 0; + } + break; + + case CHAR_BACKSLASH: + if ((options & PCRE2_ALT_VERBNAMES) != 0) + { + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + xoptions, FALSE, cb); + if (errorcode != 0) goto FAILED; + } + else escape = 0; /* Treat all as literal */ + + switch(escape) + { + case 0: /* Don't use PARSED_LITERAL() because it */ +#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. 
*/ + if (c >= META_END) *parsed_pattern++ = META_BIGVALUE; +#endif + *parsed_pattern++ = c; + break; + + case ESC_Q: + inescq = TRUE; + break; + + case ESC_E: /* Ignore */ + break; + + default: + errorcode = ERR40; /* Invalid in verb name */ + goto FAILED; + } + } + continue; /* Next character in pattern */ + } + + /* Not a verb name character. At this point we must process everything that + must not change the quantification state. This is mainly comments, but we + handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as + A+, as in Perl. An isolated \E is ignored. */ + + if (c == CHAR_BACKSLASH && ptr < ptrend) + { + if (*ptr == CHAR_Q || *ptr == CHAR_E) + { + inescq = *ptr == CHAR_Q; + ptr++; + continue; + } + } + + /* Skip over whitespace and # comments in extended mode. Note that c is a + character, not a code unit, so we must not use MAX_255 to test its size + because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The + whitespace characters are those designated as "Pattern White Space" by + Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is + U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a + subset of space characters that match \h and \v. */ + + if ((options & PCRE2_EXTENDED) != 0) + { + if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; +#ifdef SUPPORT_UNICODE + if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue; +#endif + if (c == CHAR_NUMBER_SIGN) + { + while (ptr < ptrend) + { + if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ + { /* IS_NEWLINE sets cb->nllen. 
*/ + ptr += cb->nllen; + break; + } + ptr++; +#ifdef SUPPORT_UNICODE + if (utf) FORWARDCHARTEST(ptr, ptrend); +#endif + } + continue; /* Next character in pattern */ + } + } + + /* Skip over bracketed comments */ + + if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 && + ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) + { + while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS); + if (ptr >= ptrend) + { + errorcode = ERR18; /* A special error for missing ) in a comment */ + goto FAILED; /* to make it easier to debug. */ + } + ptr++; + continue; /* Next character in pattern */ + } + + /* If the next item is not a quantifier, fill in length of any previous + callout and create an auto callout if required. */ + + if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK && + (c != CHAR_LEFT_CURLY_BRACKET || + (tempptr = ptr, + !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) + { + if (after_manual_callout-- <= 0) + parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout, + parsed_pattern, cb); + } + + /* If expect_cond_assert is 2, we have just passed (?( and are expecting an + assertion, possibly preceded by a callout. If the value is 1, we have just + had the callout and expect an assertion. There must be at least 3 more + characters in all cases. When expect_cond_assert is 2, we know that the + current character is an opening parenthesis, as otherwise we wouldn't be + here. However, when it is 1, we need to check, and it's easiest just to check + always. Note that expect_cond_assert may be negative, since all callouts just + decrement it. 
*/ + + if (expect_cond_assert > 0) + { + BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 && + (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK); + if (ok) + { + if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */ + { + ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0; + } + else switch(ptr[1]) /* Traditional symbolic format */ + { + case CHAR_C: + ok = expect_cond_assert == 2; + break; + + case CHAR_EQUALS_SIGN: + case CHAR_EXCLAMATION_MARK: + break; + + case CHAR_LESS_THAN_SIGN: + ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK; + break; + + default: + ok = FALSE; + } + } + + if (!ok) + { + ptr--; /* Adjust error offset */ + errorcode = ERR28; + goto FAILED; + } + } + + /* Remember whether we are expecting a conditional assertion, and set the + default for this item. */ + + prev_expect_cond_assert = expect_cond_assert; + expect_cond_assert = 0; + + /* Remember quantification status for the previous significant item, then set + default for this item. */ + + prev_okquantifier = okquantifier; + prev_meta_quantifier = meta_quantifier; + okquantifier = FALSE; + meta_quantifier = 0; + + /* If the previous significant item was a quantifier, adjust the parsed code + if there is a following modifier. The base meta value is always followed by + the PLUS and QUERY values, in that order. We do this here rather than after + reading a quantifier so that intervening comments and /x whitespace can be + ignored without having to replicate code. */ + + if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS)) + { + parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] = + prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)? + 0x00020000u : 0x00010000u); + continue; /* Next character in pattern */ + } + + + /* Process the next item in the main part of a pattern. 
*/ + + switch(c) + { + default: /* Non-special character */ + PARSED_LITERAL(c, parsed_pattern); + break; + + + /* ---- Escape sequence ---- */ + + case CHAR_BACKSLASH: + tempptr = ptr; + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + xoptions, FALSE, cb); + if (errorcode != 0) + { + ESCAPE_FAILED: + if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) + goto FAILED; + ptr = tempptr; + if (ptr >= ptrend) c = CHAR_BACKSLASH; else + { + GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ + } + escape = 0; /* Treat as literal character */ + } + + /* The escape was a data escape or literal character. */ + + if (escape == 0) + { + PARSED_LITERAL(c, parsed_pattern); + } + + /* The escape was a back (or forward) reference. We keep the offset in + order to give a more useful diagnostic for a bad forward reference. For + references to groups numbered less than 10 we can't use more than two items + in parsed_pattern because they may be just two characters in the input (and + in a 64-bit world an offset may need two elements). So for them, the offset + of the first occurrent is held in a special vector. */ + + else if (escape < 0) + { + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1); + escape = -escape; + *parsed_pattern++ = META_BACKREF | (uint32_t)escape; + if (escape < 10) + { + if (cb->small_ref_offset[escape] == PCRE2_UNSET) + cb->small_ref_offset[escape] = offset; + } + else + { + PUTOFFSET(offset, parsed_pattern); + } + okquantifier = TRUE; + } + + /* The escape was a character class such as \d etc. or other special + escape indicator such as \A or \X. Most of them generate just a single + parsed item, but \P and \p are followed by a 16-bit type and a 16-bit + value. They are supported only when Unicode is available. The type and + value are packed into a single 32-bit value so that the whole sequences + uses only two elements in the parsed_vector. 
This is because the same + coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is + set. + + There are also some cases where the escape sequence is followed by a name: + \k{name}, \k, and \k'name' are backreferences by name, and \g + and \g'name' are subroutine calls by name; \g{name} is a synonym for + \k{name}. Note that \g and \g'number' are handled by check_escape() + and returned as a negative value (handled above). A name is coded as an + offset into the pattern and a length. */ + + else switch (escape) + { + case ESC_C: +#ifdef NEVER_BACKSLASH_C + errorcode = ERR85; + goto ESCAPE_FAILED; +#else + if ((options & PCRE2_NEVER_BACKSLASH_C) != 0) + { + errorcode = ERR83; + goto ESCAPE_FAILED; + } +#endif + okquantifier = TRUE; + *parsed_pattern++ = META_ESCAPE + escape; + break; + + case ESC_X: +#ifndef SUPPORT_UNICODE + errorcode = ERR45; /* Supported only with Unicode support */ + goto ESCAPE_FAILED; +#endif + case ESC_H: + case ESC_h: + case ESC_N: + case ESC_R: + case ESC_V: + case ESC_v: + okquantifier = TRUE; + *parsed_pattern++ = META_ESCAPE + escape; + break; + + default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */ + *parsed_pattern++ = META_ESCAPE + escape; + break; + + /* Escapes that may change in UCP mode. */ + + case ESC_d: + case ESC_D: + case ESC_s: + case ESC_S: + case ESC_w: + case ESC_W: + okquantifier = TRUE; + parsed_pattern = handle_escdsw(escape, parsed_pattern, options, + xoptions); + break; + + /* Unicode property matching */ + + case ESC_P: + case ESC_p: +#ifdef SUPPORT_UNICODE + { + BOOL negated; + uint16_t ptype = 0, pdata = 0; + if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) + goto ESCAPE_FAILED; + if (negated) escape = (escape == ESC_P)? 
ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; + okquantifier = TRUE; + } +#else + errorcode = ERR45; + goto ESCAPE_FAILED; +#endif + break; /* End \P and \p */ + + /* When \g is used with quotes or angle brackets as delimiters, it is a + numerical or named subroutine call, and control comes here. When used + with brace delimiters it is a numberical back reference and does not come + here because check_escape() returns it directly as a reference. \k is + always a named back reference. */ + + case ESC_g: + case ESC_k: + if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET && + *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE)) + { + errorcode = (escape == ESC_g)? ERR57 : ERR69; + goto ESCAPE_FAILED; + } + terminator = (*ptr == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? + CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; + + /* For a non-braced \g, check for a numerical recursion. */ + + if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET) + { + PCRE2_SPTR p = ptr + 1; + + if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, + &errorcode)) + { + if (p >= ptrend || *p != terminator) + { + errorcode = ERR57; + goto ESCAPE_FAILED; + } + ptr = p; + goto SET_RECURSION; + } + if (errorcode != 0) goto ESCAPE_FAILED; + } + + /* Not a numerical recursion */ + + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto ESCAPE_FAILED; + + /* \k and \g when used with braces are back references, whereas \g used + with quotes or angle brackets is a recursion */ + + *parsed_pattern++ = + (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)? 
+ META_BACKREF_BYNAME : META_RECURSE_BYNAME; + *parsed_pattern++ = namelen; + + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; /* End special escape processing */ + } + break; /* End escape sequence processing */ + + + /* ---- Single-character special items ---- */ + + case CHAR_CIRCUMFLEX_ACCENT: + *parsed_pattern++ = META_CIRCUMFLEX; + break; + + case CHAR_DOLLAR_SIGN: + *parsed_pattern++ = META_DOLLAR; + break; + + case CHAR_DOT: + *parsed_pattern++ = META_DOT; + okquantifier = TRUE; + break; + + + /* ---- Single-character quantifiers ---- */ + + case CHAR_ASTERISK: + meta_quantifier = META_ASTERISK; + goto CHECK_QUANTIFIER; + + case CHAR_PLUS: + meta_quantifier = META_PLUS; + goto CHECK_QUANTIFIER; + + case CHAR_QUESTION_MARK: + meta_quantifier = META_QUERY; + goto CHECK_QUANTIFIER; + + + /* ---- Potential {n,m} quantifier ---- */ + + case CHAR_LEFT_CURLY_BRACKET: + if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat, + &errorcode)) + { + if (errorcode != 0) goto FAILED; /* Error in quantifier. */ + PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */ + break; /* No more quantifier processing */ + } + meta_quantifier = META_MINMAX; + /* Fall through */ + + + /* ---- Quantifier post-processing ---- */ + + /* Check that a quantifier is allowed after the previous item. */ + + CHECK_QUANTIFIER: + if (!prev_okquantifier) + { + errorcode = ERR9; + goto FAILED_BACK; + } + + /* Most (*VERB)s are not allowed to be quantified, but an ungreedy + quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a + sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by + wrapping it in non-capturing brackets, but we have to allow for a preceding + (*MARK) for when (*ACCEPT) has an argument. 
*/ + + if (parsed_pattern[-1] == META_ACCEPT) + { + uint32_t *p; + for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0]; + *verbstartptr = META_NOCAPTURE; + parsed_pattern[1] = META_KET; + parsed_pattern += 2; + } + + /* Now we can put the quantifier into the parsed pattern vector. At this + stage, we have only the basic quantifier. The check for a following + or ? + modifier happens at the top of the loop, after any intervening comments + have been removed. */ + + *parsed_pattern++ = meta_quantifier; + if (c == CHAR_LEFT_CURLY_BRACKET) + { + *parsed_pattern++ = min_repeat; + *parsed_pattern++ = max_repeat; + } + break; + + + /* ---- Character class ---- */ + + case CHAR_LEFT_SQUARE_BRACKET: + okquantifier = TRUE; + + /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is + used for "start of word" and "end of word". As these are otherwise illegal + sequences, we don't break anything by recognizing them. They are replaced + by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are + erroneous and are handled by the normal code below. */ + + if (ptrend - ptr >= 6 && + (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 || + PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0)) + { + *parsed_pattern++ = META_ESCAPE + ESC_b; + + if (ptr[2] == CHAR_LESS_THAN_SIGN) + { + *parsed_pattern++ = META_LOOKAHEAD; + } + else + { + *parsed_pattern++ = META_LOOKBEHIND; + *has_lookbehind = TRUE; + + /* The offset is used only for the "non-fixed length" error; this won't + occur here, so just store zero. */ + + PUTOFFSET((PCRE2_SIZE)0, parsed_pattern); + } + + if ((options & PCRE2_UCP) == 0) + *parsed_pattern++ = META_ESCAPE + ESC_w; + else + { + *parsed_pattern++ = META_ESCAPE + ESC_p; + *parsed_pattern++ = PT_WORD << 16; + } + *parsed_pattern++ = META_KET; + ptr += 6; + break; + } + + /* PCRE supports POSIX class stuff inside a class. Perl gives an error if + they are encountered at the top level, so we'll do that too. 
*/ + + if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT || + *ptr == CHAR_EQUALS_SIGN) && + check_posix_syntax(ptr, ptrend, &tempptr)) + { + errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13; + goto FAILED; + } + + /* Process a regular character class. If the first character is '^', set + the negation flag. If the first few characters (either before or after ^) + are \Q\E or \E or space or tab in extended-more mode, we skip them too. + This makes for compatibility with Perl. */ + + negate_class = FALSE; + while (ptr < ptrend) + { + GETCHARINCTEST(c, ptr); + if (c == CHAR_BACKSLASH) + { + if (ptr < ptrend && *ptr == CHAR_E) ptr++; + else if (ptrend - ptr >= 3 && + PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; + } + else if ((options & PCRE2_EXTENDED_MORE) != 0 && + (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */ + continue; + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) + negate_class = TRUE; + else break; + } + + /* Now the real contents of the class; c has the first "real" character. + Empty classes are permitted only if the option is set. */ + + if (c == CHAR_RIGHT_SQUARE_BRACKET && + (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) + { + *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY; + break; /* End of class processing */ + } + + /* Process a non-empty class. */ + + *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS; + class_range_state = RANGE_NO; + + /* In an EBCDIC environment, Perl treats alphabetic ranges specially + because there are holes in the encoding, and simply using the range A-Z + (for example) would include the characters in the holes. This applies only + to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z] + in this respect. In order to accommodate this, we keep track of whether + character values are literal or not, and a state variable for handling + ranges. 
*/ + + /* Loop for the contents of the class */ + + for (;;) + { + BOOL char_is_literal = TRUE; + + /* Inside \Q...\E everything is literal except \E */ + + if (inescq) + { + if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) + { + inescq = FALSE; /* Reset literal state */ + ptr++; /* Skip the 'E' */ + goto CLASS_CONTINUE; + } + goto CLASS_LITERAL; + } + + /* Skip over space and tab (only) in extended-more mode. */ + + if ((options & PCRE2_EXTENDED_MORE) != 0 && + (c == CHAR_SPACE || c == CHAR_HT)) + goto CLASS_CONTINUE; + + /* Handle POSIX class names. Perl allows a negation extension of the + form [:^name:]. A square bracket that doesn't match the syntax is + treated as a literal. We also recognize the POSIX constructions + [.ch.] and [=ch=] ("collating elements") and fault them, as Perl + 5.6 and 5.8 do. */ + + if (c == CHAR_LEFT_SQUARE_BRACKET && + ptrend - ptr >= 3 && + (*ptr == CHAR_COLON || *ptr == CHAR_DOT || + *ptr == CHAR_EQUALS_SIGN) && + check_posix_syntax(ptr, ptrend, &tempptr)) + { + BOOL posix_negate = FALSE; + int posix_class; + + /* Perl treats a hyphen before a POSIX class as a literal, not the + start of a range. However, it gives a warning in its warning mode. PCRE + does not have a warning mode, so we give an error, because this is + likely an error on the user's part. */ + + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; + goto FAILED; + } + + if (*ptr != CHAR_COLON) + { + errorcode = ERR13; + goto FAILED_BACK; + } + + if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) + { + posix_negate = TRUE; + ptr++; + } + + posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); + if (posix_class < 0) + { + errorcode = ERR30; + goto FAILED; + } + ptr = tempptr + 2; + + /* Perl treats a hyphen after a POSIX class as a literal, not the + start of a range. However, it gives a warning in its warning mode + unless the hyphen is the last character in the class. 
PCRE does not + have a warning mode, so we give an error, because this is likely an + error on the user's part. */ + + if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && + ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) + { + errorcode = ERR50; + goto FAILED; + } + + /* Set "a hyphen is not the start of a range" for the -] case, and also + in case the POSIX class is followed by \E or \Q\E (possibly repeated - + fuzzers do that kind of thing) and *then* a hyphen. This causes that + hyphen to be treated as a literal. I don't think it's worth setting up + special apparatus to do otherwise. */ + + class_range_state = RANGE_NO; + + /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some + of the POSIX classes are converted to use Unicode properties \p or \P + or, in one case, \h or \H. The substitutes table has two values per + class, containing the type and value of a \p or \P item. The special + cases are specified with a negative type: a non-zero value causes \h or + \H to be used, and a zero value falls through to behave like a non-UCP + POSIX class. There are now also some extra options that force ASCII for + some classes. */ + +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UCP) != 0 && + (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) + { + int ptype = posix_substitutes[2*posix_class]; + int pvalue = posix_substitutes[2*posix_class + 1]; + + if (ptype >= 0) + { + *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p); + *parsed_pattern++ = (ptype << 16) | pvalue; + goto CLASS_CONTINUE; + } + + if (pvalue != 0) + { + *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h); + goto CLASS_CONTINUE; + } + + /* Fall through */ + } +#endif /* SUPPORT_UNICODE */ + + /* Non-UCP POSIX class */ + + *parsed_pattern++ = posix_negate? 
META_POSIX_NEG : META_POSIX; + *parsed_pattern++ = posix_class; + } + + /* Handle potential start of range */ + + else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED) + { + *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)? + META_RANGE_LITERAL : META_RANGE_ESCAPED; + class_range_state = RANGE_STARTED; + } + + /* Handle a literal character */ + + else if (c != CHAR_BACKSLASH) + { + CLASS_LITERAL: + if (class_range_state == RANGE_STARTED) + { + if (c == parsed_pattern[-2]) /* Optimize one-char range */ + parsed_pattern--; + else if (parsed_pattern[-2] > c) /* Check range is in order */ + { + errorcode = ERR8; + goto FAILED_BACK; + } + else + { + if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL) + parsed_pattern[-1] = META_RANGE_ESCAPED; + PARSED_LITERAL(c, parsed_pattern); + } + class_range_state = RANGE_NO; + } + else /* Potential start of range */ + { + class_range_state = char_is_literal? + RANGE_OK_LITERAL : RANGE_OK_ESCAPED; + PARSED_LITERAL(c, parsed_pattern); + } + } + + /* Handle escapes in a class */ + + else + { + tempptr = ptr; + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + xoptions, TRUE, cb); + + if (errorcode != 0) + { + if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) + goto FAILED; + ptr = tempptr; + if (ptr >= ptrend) c = CHAR_BACKSLASH; else + { + GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ + } + escape = 0; /* Treat as literal character */ + } + + switch(escape) + { + case 0: /* Escaped character code point is in c */ + char_is_literal = FALSE; + goto CLASS_LITERAL; + + case ESC_b: + c = CHAR_BS; /* \b is backspace in a class */ + char_is_literal = FALSE; + goto CLASS_LITERAL; + + case ESC_Q: + inescq = TRUE; /* Enter literal mode */ + goto CLASS_CONTINUE; + + case ESC_E: /* Ignore orphan \E */ + goto CLASS_CONTINUE; + + case ESC_B: /* Always an error in a class */ + case ESC_R: + case ESC_X: + errorcode = ERR7; + ptr--; + goto FAILED; + } + + /* The 
second part of a range can be a single-character escape + sequence (detected above), but not any of the other escapes. Perl + treats a hyphen as a literal in such circumstances. However, in Perl's + warning mode, a warning is given, so PCRE now faults it, as it is + almost certainly a mistake on the user's part. */ + + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; + goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */ + } + + /* Of the remaining escapes, only those that define characters are + allowed in a class. None may start a range. */ + + class_range_state = RANGE_NO; + switch(escape) + { + case ESC_N: + errorcode = ERR71; + goto FAILED; + + case ESC_H: + case ESC_h: + case ESC_V: + case ESC_v: + *parsed_pattern++ = META_ESCAPE + escape; + break; + + /* These escapes may be converted to Unicode property tests when + PCRE2_UCP is set. */ + + case ESC_d: + case ESC_D: + case ESC_s: + case ESC_S: + case ESC_w: + case ESC_W: + parsed_pattern = handle_escdsw(escape, parsed_pattern, options, + xoptions); + break; + + /* Explicit Unicode property matching */ + + case ESC_P: + case ESC_p: +#ifdef SUPPORT_UNICODE + { + BOOL negated; + uint16_t ptype = 0, pdata = 0; + if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) + goto FAILED; + if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; + } +#else + errorcode = ERR45; + goto FAILED; +#endif + break; /* End \P and \p */ + + default: /* All others are not allowed in a class */ + errorcode = ERR7; + ptr--; + goto FAILED; + } + + /* Perl gives a warning unless a following hyphen is the last character + in the class. PCRE throws an error. */ + + if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && + ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) + { + errorcode = ERR50; + goto FAILED; + } + } + + /* Proceed to next thing in the class. 
*/ + + CLASS_CONTINUE: + if (ptr >= ptrend) + { + errorcode = ERR6; /* Missing terminating ']' */ + goto FAILED; + } + GETCHARINCTEST(c, ptr); + if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; + } /* End of class-processing loop */ + + /* -] at the end of a class is a literal '-' */ + + if (class_range_state == RANGE_STARTED) + { + parsed_pattern[-1] = CHAR_MINUS; + class_range_state = RANGE_NO; + } + + *parsed_pattern++ = META_CLASS_END; + break; /* End of character class */ + + + /* ---- Opening parenthesis ---- */ + + case CHAR_LEFT_PARENTHESIS: + if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* If ( is not followed by ? it is either a capture or a special verb or an + alpha assertion or a positive non-atomic lookahead. */ + + if (*ptr != CHAR_QUESTION_MARK) + { + const char *vn; + + /* Handle capturing brackets (or non-capturing if auto-capture is turned + off). */ + + if (*ptr != CHAR_ASTERISK) + { + nest_depth++; + if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) + { + if (cb->bracount >= MAX_GROUP_NUMBER) + { + errorcode = ERR97; + goto FAILED; + } + cb->bracount++; + *parsed_pattern++ = META_CAPTURE | cb->bracount; + } + else *parsed_pattern++ = META_NOCAPTURE; + } + + /* Do nothing for (* followed by end of pattern or ) so it gives a "bad + quantifier" error rather than "(*MARK) must have an argument". */ + + else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS) + break; + + /* Handle "alpha assertions" such as (*pla:...). Most of these are + synonyms for the historical symbolic assertions, but the script run and + non-atomic lookaround ones are new. They are distinguished by starting + with a lower case letter. Checking both ends of the alphabet makes this + work in all character codes. 
*/ + + else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) + { + uint32_t meta; + + vn = alasnames; + if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + if (ptr >= ptrend || *ptr != CHAR_COLON) + { + errorcode = ERR95; /* Malformed */ + goto FAILED; + } + + /* Scan the table of alpha assertion names */ + + for (i = 0; i < alascount; i++) + { + if (namelen == alasmeta[i].len && + PRIV(strncmp_c8)(name, vn, namelen) == 0) + break; + vn += alasmeta[i].len + 1; + } + + if (i >= alascount) + { + errorcode = ERR95; /* Alpha assertion not recognized */ + goto FAILED; + } + + /* Check for expecting an assertion condition. If so, only atomic + lookaround assertions are valid. */ + + meta = alasmeta[i].meta; + if (prev_expect_cond_assert > 0 && + (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT)) + { + errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)? + ERR98 : ERR28; /* (Atomic) assertion expected */ + goto FAILED; + } + + /* The lookaround alphabetic synonyms can mostly be handled by jumping + to the code that handles the traditional symbolic forms. */ + + switch(meta) + { + default: + errorcode = ERR89; /* Unknown code; should never occur because */ + goto FAILED; /* the meta values come from a table above. */ + + case META_ATOMIC: + goto ATOMIC_GROUP; + + case META_LOOKAHEAD: + goto POSITIVE_LOOK_AHEAD; + + case META_LOOKAHEAD_NA: + goto POSITIVE_NONATOMIC_LOOK_AHEAD; + + case META_LOOKAHEADNOT: + goto NEGATIVE_LOOK_AHEAD; + + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + case META_LOOKBEHIND_NA: + *parsed_pattern++ = meta; + ptr--; + goto POST_LOOKBEHIND; + + /* The script run facilities are handled here. Unicode support is + required (give an error if not, as this is a security issue). Always + record a META_SCRIPT_RUN item. Then, for the atomic version, insert + META_ATOMIC and remember that we need two META_KETs at the end. 
*/ + + case META_SCRIPT_RUN: + case META_ATOMIC_SCRIPT_RUN: +#ifdef SUPPORT_UNICODE + *parsed_pattern++ = META_SCRIPT_RUN; + nest_depth++; + ptr++; + if (meta == META_ATOMIC_SCRIPT_RUN) + { + *parsed_pattern++ = META_ATOMIC; + if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); + else if (++top_nest >= end_nests) + { + errorcode = ERR84; + goto FAILED; + } + top_nest->nest_depth = nest_depth; + top_nest->flags = NSF_ATOMICSR; + top_nest->options = options & PARSE_TRACKED_OPTIONS; + top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS; + } + break; +#else /* SUPPORT_UNICODE */ + errorcode = ERR96; + goto FAILED; +#endif + } + } + + + /* ---- Handle (*VERB) and (*VERB:NAME) ---- */ + + else + { + vn = verbnames; + if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + if (ptr >= ptrend || (*ptr != CHAR_COLON && + *ptr != CHAR_RIGHT_PARENTHESIS)) + { + errorcode = ERR60; /* Malformed */ + goto FAILED; + } + + /* Scan the table of verb names */ + + for (i = 0; i < verbcount; i++) + { + if (namelen == verbs[i].len && + PRIV(strncmp_c8)(name, vn, namelen) == 0) + break; + vn += verbs[i].len + 1; + } + + if (i >= verbcount) + { + errorcode = ERR60; /* Verb not recognized */ + goto FAILED; + } + + /* An empty argument is treated as no argument. */ + + if (*ptr == CHAR_COLON && ptr + 1 < ptrend && + ptr[1] == CHAR_RIGHT_PARENTHESIS) + ptr++; /* Advance to the closing parens */ + + /* Check for mandatory non-empty argument; this is (*MARK) */ + + if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON) + { + errorcode = ERR66; + goto FAILED; + } + + /* Remember where this verb, possibly with a preceding (*MARK), starts, + for handling quantified (*ACCEPT). 
*/ + + verbstartptr = parsed_pattern; + okquantifier = (verbs[i].meta == META_ACCEPT); + + /* It appears that Perl allows any characters whatsoever, other than a + closing parenthesis, to appear in arguments ("names"), so we no longer + insist on letters, digits, and underscores. Perl does not, however, do + any interpretation within arguments, and has no means of including a + closing parenthesis. PCRE supports escape processing but only when it + is requested by an option. We set inverbname TRUE here, and let the + main loop take care of this so that escape and \x processing is done by + the main code above. */ + + if (*ptr++ == CHAR_COLON) /* Skip past : or ) */ + { + /* Some optional arguments can be treated as a preceding (*MARK) */ + + if (verbs[i].has_arg < 0) + { + add_after_mark = verbs[i].meta; + *parsed_pattern++ = META_MARK; + } + + /* The remaining verbs with arguments (except *MARK) need a different + opcode. */ + + else + { + *parsed_pattern++ = verbs[i].meta + + ((verbs[i].meta != META_MARK)? 0x00010000u:0); + } + + /* Set up for reading the name in the main loop. */ + + verblengthptr = parsed_pattern++; + verbnamestart = ptr; + inverbname = TRUE; + } + else /* No verb "name" argument */ + { + *parsed_pattern++ = verbs[i].meta; + } + } /* End of (*VERB) handling */ + break; /* Done with this parenthesis */ + } /* End of groups that don't start with (? */ + + + /* ---- Items starting (? ---- */ + + /* The type of item is determined by what follows (?. Handle (?| and option + changes under "default" because both need a new block on the nest stack. + Comments starting with (?# are handled above. Note that there is some + ambiguity about the sequence (?- because if a digit follows it's a relative + recursion or subroutine call whereas otherwise it's an option unsetting. 
*/ + + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + switch(*ptr) + { + default: + if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1])) + goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */ + + /* We now have either (?| or a (possibly empty) option setting, + optionally followed by a non-capturing group. */ + + nest_depth++; + if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); + else if (++top_nest >= end_nests) + { + errorcode = ERR84; + goto FAILED; + } + top_nest->nest_depth = nest_depth; + top_nest->flags = 0; + top_nest->options = options & PARSE_TRACKED_OPTIONS; + top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS; + + /* Start of non-capturing group that resets the capture count for each + branch. */ + + if (*ptr == CHAR_VERTICAL_LINE) + { + top_nest->reset_group = (uint16_t)cb->bracount; + top_nest->max_group = (uint16_t)cb->bracount; + top_nest->flags |= NSF_RESET; + cb->external_flags |= PCRE2_DUPCAPUSED; + *parsed_pattern++ = META_NOCAPTURE; + ptr++; + } + + /* Scan for options imnrsxJU to be set or unset. 
*/ + + else + { + BOOL hyphenok = TRUE; + uint32_t oldoptions = options; + uint32_t oldxoptions = xoptions; + + top_nest->reset_group = 0; + top_nest->max_group = 0; + set = unset = 0; + optset = &set; + xset = xunset = 0; + xoptset = &xset; + + /* ^ at the start unsets irmnsx and disables the subsequent use of - */ + + if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT) + { + options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| + PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE); + xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT); + hyphenok = FALSE; + ptr++; + } + + while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && + *ptr != CHAR_COLON) + { + switch (*ptr++) + { + case CHAR_MINUS: + if (!hyphenok) + { + errorcode = ERR94; + ptr--; /* Correct the offset */ + goto FAILED; + } + optset = &unset; + xoptset = &xunset; + hyphenok = FALSE; + break; + + /* There are some two-character sequences that start with 'a'. */ + + case CHAR_a: + if (ptr < ptrend) + { + if (*ptr == CHAR_D) + { + *xoptset |= PCRE2_EXTRA_ASCII_BSD; + ptr++; + break; + } + if (*ptr == CHAR_P) + { + *xoptset |= PCRE2_EXTRA_ASCII_POSIX; + ptr++; + break; + } + if (*ptr == CHAR_S) + { + *xoptset |= PCRE2_EXTRA_ASCII_BSS; + ptr++; + break; + } + if (*ptr == CHAR_W) + { + *xoptset |= PCRE2_EXTRA_ASCII_BSW; + ptr++; + break; + } + } + *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS| + PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX; + break; + + case CHAR_J: /* Record that it changed in the external options */ + *optset |= PCRE2_DUPNAMES; + cb->external_flags |= PCRE2_JCHANGED; + break; + + case CHAR_i: *optset |= PCRE2_CASELESS; break; + case CHAR_m: *optset |= PCRE2_MULTILINE; break; + case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break; + case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break; + case CHAR_s: *optset |= PCRE2_DOTALL; break; + case CHAR_U: *optset |= PCRE2_UNGREEDY; break; + + /* If x appears twice it sets the extended extended option. 
*/ + + case CHAR_x: + *optset |= PCRE2_EXTENDED; + if (ptr < ptrend && *ptr == CHAR_x) + { + *optset |= PCRE2_EXTENDED_MORE; + ptr++; + } + break; + + default: + errorcode = ERR11; + ptr--; /* Correct the offset */ + goto FAILED; + } + } + + /* If we are setting extended without extended-more, ensure that any + existing extended-more gets unset. Also, unsetting extended must also + unset extended-more. */ + + if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED || + (unset & PCRE2_EXTENDED) != 0) + unset |= PCRE2_EXTENDED_MORE; + + options = (options | set) & (~unset); + xoptions = (xoptions | xset) & (~xunset); + + /* If the options ended with ')' this is not the start of a nested + group with option changes, so the options change at this level. + In this case, if the previous level set up a nest block, discard the + one we have just created. Otherwise adjust it for the previous level. + If the options ended with ':' we are starting a non-capturing group, + possibly with an options setting. */ + + if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + if (*ptr++ == CHAR_RIGHT_PARENTHESIS) + { + nest_depth--; /* This is not a nested group after all. */ + if (top_nest > (nest_save *)(cb->start_workspace) && + (top_nest-1)->nest_depth == nest_depth) top_nest--; + else top_nest->nest_depth = nest_depth; + } + else *parsed_pattern++ = META_NOCAPTURE; + + /* If nothing changed, no need to record. */ + + if (options != oldoptions || xoptions != oldxoptions) + { + *parsed_pattern++ = META_OPTIONS; + *parsed_pattern++ = options; + *parsed_pattern++ = xoptions; + } + } /* End options processing */ + break; /* End default case after (? */ + + + /* ---- Python syntax support ---- */ + + case CHAR_P: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* (?P is the same as (?, which defines a named group. 
*/ + + if (*ptr == CHAR_LESS_THAN_SIGN) + { + terminator = CHAR_GREATER_THAN_SIGN; + goto DEFINE_NAME; + } + + /* (?P>name) is the same as (?&name), which is a recursion or subroutine + call. */ + + if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME; + + /* (?P=name) is the same as \k, a back reference by name. Anything + else after (?P is an error. */ + + if (*ptr != CHAR_EQUALS_SIGN) + { + errorcode = ERR41; + goto FAILED; + } + if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, + &namelen, &errorcode, cb)) goto FAILED; + *parsed_pattern++ = META_BACKREF_BYNAME; + *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; /* End of (?P processing */ + + + /* ---- Recursion/subroutine calls by number ---- */ + + case CHAR_R: + i = 0; /* (?R) == (?R0) */ + ptr++; + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR58; + goto FAILED; + } + goto SET_RECURSION; + + /* An item starting (?- followed by a digit comes here via the "default" + case because (?- followed by a non-digit is an options setting. */ + + case CHAR_PLUS: + if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1])) + { + errorcode = ERR29; /* Missing number */ + goto FAILED; + } + /* Fall through */ + + case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: + case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: + RECURSION_BYNUMBER: + if (!read_number(&ptr, ptrend, + (IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */ + MAX_GROUP_NUMBER, ERR61, + &i, &errorcode)) goto FAILED; + if (i < 0) /* NB (?0) is permitted */ + { + errorcode = ERR15; /* Unknown group */ + goto FAILED_BACK; + } + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + goto UNCLOSED_PARENTHESIS; + + SET_RECURSION: + *parsed_pattern++ = META_RECURSE | (uint32_t)i; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern); + ptr++; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; /* End of recursive call by number handling */ + + + /* ---- Recursion/subroutine calls by name ---- */ + + case CHAR_AMPERSAND: + RECURSE_BY_NAME: + if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, + &namelen, &errorcode, cb)) goto FAILED; + *parsed_pattern++ = META_RECURSE_BYNAME; + *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; + + /* ---- Callout with numerical or string argument ---- */ + + case CHAR_C: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* If the previous item was a condition starting (?(? an assertion, + optionally preceded by a callout, is expected. This is checked later on, + during actual compilation. However we need to identify this kind of + assertion in this pass because it must not be qualified. The value of + expect_cond_assert is set to 2 after (?(? is processed. We decrement it + for a callout - still leaving a positive value that identifies the + assertion. Multiple callouts or any other items will make it zero or + less, which doesn't matter because they will cause an error later. */ + + expect_cond_assert = prev_expect_cond_assert - 1; + + /* If previous_callout is not NULL, it means this follows a previous + callout. If it was a manual callout, do nothing; this means its "length + of next pattern item" field will remain zero. If it was an automatic + callout, abolish it. 
*/ + + if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 && + previous_callout == parsed_pattern - 4 && + parsed_pattern[-1] == 255) + parsed_pattern = previous_callout; + + /* Save for updating next pattern item length, and skip one item before + completing. */ + + previous_callout = parsed_pattern; + after_manual_callout = 1; + + /* Handle a string argument; specific delimiter is required. */ + + if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) + { + PCRE2_SIZE calloutlength; + PCRE2_SPTR startptr = ptr; + + delimiter = 0; + for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) + { + if (*ptr == PRIV(callout_start_delims)[i]) + { + delimiter = PRIV(callout_end_delims)[i]; + break; + } + } + if (delimiter == 0) + { + errorcode = ERR82; + goto FAILED; + } + + *parsed_pattern = META_CALLOUT_STRING; + parsed_pattern += 3; /* Skip pattern info */ + + for (;;) + { + if (++ptr >= ptrend) + { + errorcode = ERR81; + ptr = startptr; /* To give a more useful message */ + goto FAILED; + } + if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter)) + break; + } + + calloutlength = (PCRE2_SIZE)(ptr - startptr); + if (calloutlength > UINT32_MAX) + { + errorcode = ERR72; + goto FAILED; + } + *parsed_pattern++ = (uint32_t)calloutlength; + offset = (PCRE2_SIZE)(startptr - cb->start_pattern); + PUTOFFSET(offset, parsed_pattern); + } + + /* Handle a callout with an optional numerical argument, which must be + less than or equal to 255. A missing argument gives 0. 
*/ + + else + { + int n = 0; + *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */ + parsed_pattern += 3; /* Skip pattern info */ + while (ptr < ptrend && IS_DIGIT(*ptr)) + { + n = n * 10 + *ptr++ - CHAR_0; + if (n > 255) + { + errorcode = ERR38; + goto FAILED; + } + } + *parsed_pattern++ = n; + } + + /* Both formats must have a closing parenthesis */ + + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR39; + goto FAILED; + } + ptr++; + + /* Remember the offset to the next item in the pattern, and set a default + length. This should get updated after the next item is read. */ + + previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); + previous_callout[2] = 0; + break; /* End callout */ + + + /* ---- Conditional group ---- */ + + /* A condition can be an assertion, a number (referring to a numbered + group's having been set), a name (referring to a named group), or 'R', + referring to overall recursion. R and R&name are also permitted + for recursion state tests. Numbers may be preceded by + or - to specify a + relative group number. + + There are several syntaxes for testing a named group: (?(name)) is used + by Python; Perl 5.10 onwards uses (?() or (?('name')). + + There are two unfortunate ambiguities. 'R' can be the recursive thing or + the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be + the Perl DEFINE feature or the Python named test. We look for a name + first; if not found, we try the other case. + + For compatibility with auto-callouts, we allow a callout to be specified + before a condition that is an assertion. */ + + case CHAR_LEFT_PARENTHESIS: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + nest_depth++; + + /* If the next character is ? or * there must be an assertion next + (optionally preceded by a callout). We do not check this here, but + instead we set expect_cond_assert to 2. 
If this is still greater than + zero (callouts decrement it) when the next assertion is read, it will be + marked as a condition that must not be repeated. A value greater than + zero also causes checking that an assertion (possibly with callout) + follows. */ + + if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK) + { + *parsed_pattern++ = META_COND_ASSERT; + ptr--; /* Pull pointer back to the opening parenthesis. */ + expect_cond_assert = 2; + break; /* End of conditional */ + } + + /* Handle (?([+-]number)... */ + + if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, + &errorcode)) + { + if (i <= 0) + { + errorcode = ERR15; + goto FAILED; + } + *parsed_pattern++ = META_COND_NUMBER; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); + PUTOFFSET(offset, parsed_pattern); + *parsed_pattern++ = i; + } + else if (errorcode != 0) goto FAILED; /* Number too big */ + + /* No number found. Handle the special case (?(VERSION[>]=n.m)... */ + + else if (ptrend - ptr >= 10 && + PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && + ptr[7] != CHAR_RIGHT_PARENTHESIS) + { + uint32_t ge = 0; + int major = 0; + int minor = 0; + + ptr += 7; + if (*ptr == CHAR_GREATER_THAN_SIGN) + { + ge = 1; + ptr++; + } + + /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT + references its argument twice. 
*/ + + if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) + goto BAD_VERSION_CONDITION; + + if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode)) + goto FAILED; + + if (ptr >= ptrend) goto BAD_VERSION_CONDITION; + if (*ptr == CHAR_DOT) + { + if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION; + minor = (*ptr++ - CHAR_0) * 10; + if (ptr >= ptrend) goto BAD_VERSION_CONDITION; + if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0; + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + goto BAD_VERSION_CONDITION; + } + + *parsed_pattern++ = META_COND_VERSION; + *parsed_pattern++ = ge; + *parsed_pattern++ = major; + *parsed_pattern++ = minor; + } + + /* All the remaining cases now require us to read a name. We cannot at + this stage distinguish ambiguous cases such as (?(R12) which might be a + recursion test by number or a name, because the named groups have not yet + all been identified. Those cases are treated as names, but given a + different META code. */ + + else + { + BOOL was_r_ampersand = FALSE; + + if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND) + { + terminator = CHAR_RIGHT_PARENTHESIS; + was_r_ampersand = TRUE; + ptr++; + } + else if (*ptr == CHAR_LESS_THAN_SIGN) + terminator = CHAR_GREATER_THAN_SIGN; + else if (*ptr == CHAR_APOSTROPHE) + terminator = CHAR_APOSTROPHE; + else + { + terminator = CHAR_RIGHT_PARENTHESIS; + ptr--; /* Point to char before name */ + } + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + + /* Handle (?(R&name) */ + + if (was_r_ampersand) + { + *parsed_pattern = META_COND_RNAME; + ptr--; /* Back to closing parens */ + } + + /* Handle (?(name). If the name is "DEFINE" we identify it with a + special code. Likewise if the name consists of R followed only by + digits. Otherwise, handle it like a quoted name. 
*/ + + else if (terminator == CHAR_RIGHT_PARENTHESIS) + { + if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) + *parsed_pattern = META_COND_DEFINE; + else + { + for (i = 1; i < (int)namelen; i++) + if (!IS_DIGIT(name[i])) break; + *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)? + META_COND_RNUMBER : META_COND_NAME; + } + ptr--; /* Back to closing parens */ + } + + /* Handle (?('name') or (?() */ + + else *parsed_pattern = META_COND_NAME; + + /* All these cases except DEFINE end with the name length and offset; + DEFINE just has an offset (for the "too many branches" error). */ + + if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + } /* End cases that read a name */ + + /* Check the closing parenthesis of the condition */ + + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR24; + goto FAILED; + } + ptr++; + break; /* End of condition processing */ + + + /* ---- Atomic group ---- */ + + case CHAR_GREATER_THAN_SIGN: + ATOMIC_GROUP: /* Come from (*atomic: */ + *parsed_pattern++ = META_ATOMIC; + nest_depth++; + ptr++; + break; + + + /* ---- Lookahead assertions ---- */ + + case CHAR_EQUALS_SIGN: + POSITIVE_LOOK_AHEAD: /* Come from (*pla: */ + *parsed_pattern++ = META_LOOKAHEAD; + ptr++; + goto POST_ASSERTION; + + case CHAR_ASTERISK: + POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */ + *parsed_pattern++ = META_LOOKAHEAD_NA; + ptr++; + goto POST_ASSERTION; + + case CHAR_EXCLAMATION_MARK: + NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ + *parsed_pattern++ = META_LOOKAHEADNOT; + ptr++; + goto POST_ASSERTION; + + + /* ---- Lookbehind assertions ---- */ + + /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< + is the start of the name of a capturing group. 
*/ + + case CHAR_LESS_THAN_SIGN: + if (ptrend - ptr <= 1 || + (ptr[1] != CHAR_EQUALS_SIGN && + ptr[1] != CHAR_EXCLAMATION_MARK && + ptr[1] != CHAR_ASTERISK)) + { + terminator = CHAR_GREATER_THAN_SIGN; + goto DEFINE_NAME; + } + *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? + META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? + META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; + + POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ + *has_lookbehind = TRUE; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); + PUTOFFSET(offset, parsed_pattern); + ptr += 2; + /* Fall through */ + + /* If the previous item was a condition starting (?(? an assertion, + optionally preceded by a callout, is expected. This is checked later on, + during actual compilation. However we need to identify this kind of + assertion in this pass because it must not be qualified. The value of + expect_cond_assert is set to 2 after (?(? is processed. We decrement it + for a callout - still leaving a positive value that identifies the + assertion. Multiple callouts or any other items will make it zero or + less, which doesn't matter because they will cause an error later. */ + + POST_ASSERTION: + nest_depth++; + if (prev_expect_cond_assert > 0) + { + if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); + else if (++top_nest >= end_nests) + { + errorcode = ERR84; + goto FAILED; + } + top_nest->nest_depth = nest_depth; + top_nest->flags = NSF_CONDASSERT; + top_nest->options = options & PARSE_TRACKED_OPTIONS; + top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS; + } + break; + + + /* ---- Define a named group ---- */ + + /* A named group may be defined as (?'name') or (?). In the latter + case we jump to DEFINE_NAME from the disambiguation of (?< above with the + terminator set to '>'. 
*/ + + case CHAR_APOSTROPHE: + terminator = CHAR_APOSTROPHE; /* Terminator */ + + DEFINE_NAME: + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + + /* We have a name for this capturing group. It is also assigned a number, + which is its primary means of identification. */ + + if (cb->bracount >= MAX_GROUP_NUMBER) + { + errorcode = ERR97; + goto FAILED; + } + cb->bracount++; + *parsed_pattern++ = META_CAPTURE | cb->bracount; + nest_depth++; + + /* Check not too many names */ + + if (cb->names_found >= MAX_NAME_COUNT) + { + errorcode = ERR49; + goto FAILED; + } + + /* Adjust the entry size to accommodate the longest name found. */ + + if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) + cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1); + + /* Scan the list to check for duplicates. For duplicate names, if the + number is the same, break the loop, which causes the name to be + discarded; otherwise, if DUPNAMES is not set, give an error. + If it is set, allow the name with a different number, but continue + scanning in case this is a duplicate with the same number. For + non-duplicate names, give an error if the number is duplicated. 
*/ + + isdupname = FALSE; + ng = cb->named_groups; + for (i = 0; i < cb->names_found; i++, ng++) + { + if (namelen == ng->length && + PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0) + { + if (ng->number == cb->bracount) break; + if ((options & PCRE2_DUPNAMES) == 0) + { + errorcode = ERR43; + goto FAILED; + } + isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ + cb->dupnames = TRUE; /* Duplicate names exist */ + } + else if (ng->number == cb->bracount) + { + errorcode = ERR65; + goto FAILED; + } + } + + if (i < cb->names_found) break; /* Ignore duplicate with same number */ + + /* Increase the list size if necessary */ + + if (cb->names_found >= cb->named_group_list_size) + { + uint32_t newsize = cb->named_group_list_size * 2; + named_group *newspace = + cb->cx->memctl.malloc(newsize * sizeof(named_group), + cb->cx->memctl.memory_data); + if (newspace == NULL) + { + errorcode = ERR21; + goto FAILED; + } + + memcpy(newspace, cb->named_groups, + cb->named_group_list_size * sizeof(named_group)); + if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) + cb->cx->memctl.free((void *)cb->named_groups, + cb->cx->memctl.memory_data); + cb->named_groups = newspace; + cb->named_group_list_size = newsize; + } + + /* Add this name to the list */ + + cb->named_groups[cb->names_found].name = name; + cb->named_groups[cb->names_found].length = (uint16_t)namelen; + cb->named_groups[cb->names_found].number = cb->bracount; + cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname; + cb->names_found++; + break; + } /* End of (? switch */ + break; /* End of ( handling */ + + + /* ---- Branch terminators ---- */ + + /* Alternation: reset the capture count if we are in a (?| group. 
*/ + + case CHAR_VERTICAL_LINE: + if (top_nest != NULL && top_nest->nest_depth == nest_depth && + (top_nest->flags & NSF_RESET) != 0) + { + if (cb->bracount > top_nest->max_group) + top_nest->max_group = (uint16_t)cb->bracount; + cb->bracount = top_nest->reset_group; + } + *parsed_pattern++ = META_ALT; + break; + + /* End of group; reset the capture count to the maximum if we are in a (?| + group and/or reset the options that are tracked during parsing. Disallow + quantifier for a condition that is an assertion. */ + + case CHAR_RIGHT_PARENTHESIS: + okquantifier = TRUE; + if (top_nest != NULL && top_nest->nest_depth == nest_depth) + { + options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options; + xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions; + if ((top_nest->flags & NSF_RESET) != 0 && + top_nest->max_group > cb->bracount) + cb->bracount = top_nest->max_group; + if ((top_nest->flags & NSF_CONDASSERT) != 0) + okquantifier = FALSE; + + if ((top_nest->flags & NSF_ATOMICSR) != 0) + { + *parsed_pattern++ = META_KET; + } + + if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; + else top_nest--; + } + if (nest_depth == 0) /* Unmatched closing parenthesis */ + { + errorcode = ERR22; + goto FAILED_BACK; + } + nest_depth--; + *parsed_pattern++ = META_KET; + break; + } /* End of switch on pattern character */ + } /* End of main character scan loop */ + +/* End of pattern reached. Check for missing ) at the end of a verb name. */ + +if (inverbname && ptr >= ptrend) + { + errorcode = ERR60; + goto FAILED; + } + +/* Manage callout for the final item */ + +PARSED_END: +parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, + parsed_pattern, cb); + +/* Insert trailing items for word and line matching (features provided for the +benefit of pcre2grep). 
*/ + +if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0) + { + *parsed_pattern++ = META_KET; + *parsed_pattern++ = META_DOLLAR; + } +else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0) + { + *parsed_pattern++ = META_KET; + *parsed_pattern++ = META_ESCAPE + ESC_b; + } + +/* Terminate the parsed pattern, then return success if all groups are closed. +Otherwise we have unclosed parentheses. */ + +if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } + +*parsed_pattern = META_END; +if (nest_depth == 0) return 0; + +UNCLOSED_PARENTHESIS: +errorcode = ERR14; + +/* Come here for all failures. */ + +FAILED: +cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern); +return errorcode; + +/* Some errors need to indicate the previous character. */ + +FAILED_BACK: +ptr--; +goto FAILED; + +/* This failure happens several times. */ + +BAD_VERSION_CONDITION: +errorcode = ERR79; +goto FAILED; +} + + + +/************************************************* +* Find first significant opcode * +*************************************************/ + +/* This is called by several functions that scan a compiled expression looking +for a fixed first character, or an anchoring opcode etc. It skips over things +that do not influence this. For some calls, it makes sense to skip negative +forward and all backward assertions, and also the \b assertion; for others it +does not. 
+ +Arguments: + code pointer to the start of the group + skipassert TRUE to skip OP_ASSERT_NOT, the OP_ASSERTBACK variants, and word boundary opcodes + +Returns: pointer to the first significant opcode +*/ + +static const PCRE2_UCHAR* +first_significant_code(PCRE2_SPTR code, BOOL skipassert) +{ +for (;;) + { + switch ((int)*code) + { + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ASSERTBACK_NA: + if (!skipassert) return code; + do code += GET(code, 1); while (*code == OP_ALT); /* Hop over each branch while an OP_ALT follows */ + code += PRIV(OP_lengths)[*code]; /* Then step past the opcode that ended the chain */ + break; + + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + if (!skipassert) return code; + /* Fall through */ + + case OP_CALLOUT: + case OP_CREF: + case OP_DNCREF: + case OP_RREF: + case OP_DNRREF: + case OP_FALSE: + case OP_TRUE: + code += PRIV(OP_lengths)[*code]; /* Skip by the length in the OP_lengths table */ + break; + + case OP_CALLOUT_STR: + code += GET(code, 1 + 2*LINK_SIZE); /* Skip distance is read from the item itself */ + break; + + case OP_SKIPZERO: + code += 2 + GET(code, 2) + LINK_SIZE; + break; + + case OP_COND: + case OP_SCOND: + if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */ + code[GET(code, 1)] != OP_KET) /* More than one branch */ + return code; + code += GET(code, 1) + 1 + LINK_SIZE; /* Skip the whole single-branch DEFINE group */ + break; + + case OP_MARK: + case OP_COMMIT_ARG: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1] + PRIV(OP_lengths)[*code]; /* code[1] is presumably the argument length - confirm */ + break; + + default: + return code; + } + } +/* Control never reaches here */ +} + + + +#ifdef SUPPORT_UNICODE +/************************************************* +* Get othercase range * +*************************************************/ + +/* This function is passed the start and end of a class range in UCP mode. For +single characters the range may be just one character long. The function +searches up the characters, looking for ranges of characters in the "other" +case. Each call returns the next one, updating the start address. A character +with multiple other cases is returned on its own with a special return value.
+ +Arguments: + cptr points to starting character value; updated + d end value + ocptr where to put start of othercase range + odptr where to put end of othercase range + restricted TRUE if caseless restriction applies + +Yield: -1 when no more (no output argument is written) + 0 when a range is returned + >0 the CASESET offset for char with multiple other cases; + for this return, *ocptr contains the original; *odptr is not written +*/ + +static int +get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, + uint32_t *odptr, BOOL restricted) +{ +uint32_t c, othercase, next; +unsigned int co; + +/* Find the first character that has an other case. If it has multiple other +cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the +multi-case entries that begin with ASCII values. */ + +for (c = *cptr; c <= d; c++) + { + if ((co = UCD_CASESET(c)) != 0 && + (!restricted || PRIV(ucd_caseless_sets)[co] > 127)) + { + *ocptr = c++; /* Character that has the set */ + *cptr = c; /* Rest of input range */ + return (int)co; + } + + /* This is not a valid multiple-case character. Check that the single other + case is different to the original. We don't need to check "restricted" here + because the non-ASCII characters with multiple cases that include an ASCII + character don't have a different "othercase". */ + + if ((othercase = UCD_OTHERCASE(c)) != c) break; + } + +if (c > d) return -1; /* Reached end of range */ + +/* Found a character that has a single other case. Search for the end of the +range, which is either the end of the input range, or a character that has zero +or more than one other cases.
*/ + +*ocptr = othercase; +next = othercase + 1; + +for (++c; c <= d; c++) + { + if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; /* Stop at a multi-case char or a gap in the run */ + next++; + } + +*odptr = next - 1; /* End of othercase range */ +*cptr = c; /* Rest of input range */ +return 0; +} +#endif /* SUPPORT_UNICODE */ + + + +/************************************************* +* Add a character or range to a class (internal) * +*************************************************/ + +/* This function packages up the logic of adding a character or range of +characters to a class. The character values in the arguments will be within the +valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is +called only from within the "add to class" group of functions, some of which +are recursive and mutually recursive. The external entry points are +add_to_class() and add_list_to_class(). + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options bits + xoptions the extra options bits + cb compile data + start start of range character + end end of range character + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start, + uint32_t end) +{ +uint32_t c; +uint32_t classbits_end = (end <= 0xff ? end : 0xff); /* The bit map only covers values up to 255 */ +unsigned int n8 = 0; + +/* If caseless matching is required, scan the range and process alternate +cases. In Unicode, there are 8-bit characters that have alternate cases that +are greater than 255 and vice-versa (though these may be ignored if caseless +restriction is in force). Sometimes we can just extend the original range.
*/ + +if ((options & PCRE2_CASELESS) != 0) + { +#ifdef SUPPORT_UNICODE + if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) + { + int rc; + uint32_t oc, od; + + options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ + c = start; + + while ((rc = get_othercase_range(&c, end, &oc, &od, + (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0) + { + /* Handle a single character that has more than one other case. */ + + if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, + options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc); + + /* Do nothing if the other case range is within the original range. */ + + else if (oc >= cb->class_range_start && od <= cb->class_range_end) + continue; + + /* Extend the original range if there is overlap, noting that if oc < c, + we can't have od > end because a subrange is always shorter than the + basic range. Otherwise, use a recursive call to add the additional range. + */ + + else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ + else if (od > end && oc <= end + 1) + { + end = od; /* Extend upwards */ + if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); + } + else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, + cb, oc, od); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Neither UTF nor UCP mode (or no Unicode support) */ + + for (c = start; c <= classbits_end; c++) + { + SETBIT(classbits, cb->fcc[c]); /* fcc[] presumably maps c to its other case - confirm */ + n8++; + } + } + +/* Now handle the originally supplied range. Adjust the final value according +to the bit length - this means that the same lists of (e.g.) horizontal spaces +can be used in all cases. */ + +if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) + end = MAX_NON_UTF_CHAR; + +if (start > cb->class_range_start && end < cb->class_range_end) return n8; /* Early exit when strictly inside the overall range recorded in cb */ + +/* Use the bitmap for characters < 256. Otherwise use extra data. */ + +for (c = start; c <= classbits_end; c++) + { + /* Regardless of start, c will always be <= 255.
*/ + SETBIT(classbits, c); + n8++; + } + +#ifdef SUPPORT_WIDE_CHARS +if (start <= 0xff) start = 0xff + 1; /* Values up to 255 were handled by the bit map loop above */ + +if (end >= start) + { + PCRE2_UCHAR *uchardata = *uchardptr; + +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UTF) != 0) + { + if (start < end) + { + *uchardata++ = XCL_RANGE; + uchardata += PRIV(ord2utf)(start, uchardata); + uchardata += PRIV(ord2utf)(end, uchardata); + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + uchardata += PRIV(ord2utf)(start, uchardata); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Without UTF support, character values are constrained by the bit length, + and can only be > 255 for 16-bit and 32-bit libraries. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + {} +#else + if (start < end) + { + *uchardata++ = XCL_RANGE; + *uchardata++ = start; + *uchardata++ = end; + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + *uchardata++ = start; + } +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + *uchardptr = uchardata; /* Update extra data pointer */ + } +#else /* SUPPORT_WIDE_CHARS */ + (void)uchardptr; /* Avoid compiler warning */ +#endif /* SUPPORT_WIDE_CHARS */ + +return n8; /* Number of 8-bit characters */ +} + + + +#ifdef SUPPORT_UNICODE +/************************************************* +* Add a list of characters to a class (internal) * +*************************************************/ + +/* This function is used for adding a list of case-equivalent characters to a +class when in UTF or UCP mode. This function is called only from within +add_to_class_internal(), with which it is mutually recursive. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options bits + xoptions the extra options bits + cb contains pointers to tables etc.
+ p points to row of 32-bit values, terminated by NOTACHAR + except character to omit; this is used when adding lists of + case-equivalent characters to avoid including the one we + already know about (only tested against the first value of each run) + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p, + unsigned int except) +{ +unsigned int n8 = 0; +while (p[0] < NOTACHAR) + { + unsigned int n = 0; + if (p[0] != except) + { + while(p[n+1] == p[0] + n + 1) n++; /* Extend over a run of consecutive values */ + n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb, + p[0], p[n]); + } + p += n + 1; /* Advance past this run, or past the single omitted value */ + } +return n8; +} +#endif /* SUPPORT_UNICODE */ + + + +/************************************************* +* External entry point for add range to class * +*************************************************/ + +/* This function sets the overall range so that the internal functions can try +to avoid duplication when handling case-independence. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options bits + xoptions the extra options bits + cb compile data; class_range_start and class_range_end are overwritten + start start of range character + end end of range character + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, + uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end) +{ +cb->class_range_start = start; +cb->class_range_end = end; +return add_to_class_internal(classbits, uchardptr, options, xoptions, cb, + start, end); +} + + +/************************************************* +* External entry point for add list to class * +*************************************************/ + +/* This function is used for adding a list of horizontal or vertical whitespace +characters to a class.
The list must be in order so that ranges of characters +can be detected and handled appropriately. For each run, this function sets the overall range +so that the internal functions can try to avoid duplication when handling +case-independence. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options bits + xoptions the extra options bits + cb contains pointers to tables etc. + p points to row of 32-bit values, terminated by NOTACHAR + except character to omit; this is used when adding lists of + case-equivalent characters to avoid including the one we + already know about (only tested against the first value of each run) + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, + uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except) +{ +unsigned int n8 = 0; +while (p[0] < NOTACHAR) + { + unsigned int n = 0; + if (p[0] != except) + { + while(p[n+1] == p[0] + n + 1) n++; /* Extend over a run of consecutive values */ + cb->class_range_start = p[0]; /* The run itself becomes the overall range */ + cb->class_range_end = p[n]; + n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb, + p[0], p[n]); + } + p += n + 1; /* Advance past this run, or past the single omitted value */ + } +return n8; +} + + + +/************************************************* +* Add characters not in a list to a class * +*************************************************/ + +/* This function is used for adding the complement of a list of horizontal or +vertical whitespace to a class. The list must be in order. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options bits + xoptions the extra options bits + cb contains pointers to tables etc.
+ p points to row of 32-bit values, terminated by NOTACHAR + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p) +{ +BOOL utf = (options & PCRE2_UTF) != 0; +unsigned int n8 = 0; +if (p[0] > 0) + n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1); /* Gap below the first listed value */ +while (p[0] < NOTACHAR) + { + while (p[1] == p[0] + 1) p++; /* Move to the last value of a consecutive run */ + n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1, + (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); /* Gap up to the next listed value, or to the top value for the mode */ + p++; + } +return n8; +} + + + +/************************************************* +* Find details of duplicate group names * +*************************************************/ + +/* This is called from compile_branch() when it needs to know the index and +count of duplicates in the names table when processing named backreferences, +either directly, or as conditions. + +Arguments: + name points to the name + length the length of the name + indexptr where to put the index + countptr where to put the count of duplicates + errorcodeptr where to put an error code + cb the compile block + +Returns: TRUE if OK, FALSE if not, with the error code and cb->erroroffset set +*/ + +static BOOL +find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, + int *countptr, int *errorcodeptr, compile_block *cb) +{ +uint32_t i, groupnumber; +int count; +PCRE2_UCHAR *slot = cb->name_table; + +/* Find the first entry in the table whose name matches exactly */ + +for (i = 0; i < cb->names_found; i++) + { + if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 && + slot[IMM2_SIZE+length] == 0) break; + slot += cb->name_entry_size; + } + +/* This should not occur, because this function is called only when we know we +have duplicate names. Give an internal error.
*/ + +if (i >= cb->names_found) + { + *errorcodeptr = ERR53; + cb->erroroffset = name - cb->start_pattern; + return FALSE; + } + +/* Record the index and then see how many duplicates there are, updating the +backref map and maximum back reference as we do. */ + +*indexptr = i; +count = 0; + +for (;;) + { + count++; + groupnumber = GET2(slot,0); + cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; /* Groups 32 and above all share bit 0 */ + if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; + if (++i >= cb->names_found) break; + slot += cb->name_entry_size; + if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 || + (slot+IMM2_SIZE)[length] != 0) break; /* Stop at the first different name; assumes same-named entries are adjacent - confirm */ + } + +*countptr = count; +return TRUE; +} + + + +/************************************************* +* Compile one branch * +*************************************************/ + +/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If +the options are changed during the branch, the pointer is used to change the +external options bits. This function is used during the pre-compile phase when +we are trying to find out the amount of memory needed, as well as during the +real compile phase. The value of lengthptr distinguishes the two phases. + +Arguments: + optionsptr pointer to the option bits + xoptionsptr pointer to the extra option bits + codeptr points to the pointer to the current code point + pptrptr points to the current parsed pattern pointer + errorcodeptr points to error code variable + firstcuptr place to put the first required code unit + firstcuflagsptr place to put the first code unit flags + reqcuptr place to put the last required code unit + reqcuflagsptr place to put the last required code unit flags + bcptr points to current branch chain + cb contains pointers to tables etc.
+ lengthptr NULL during the real compile phase + points to length accumulator during pre-compile phase + +Returns: 0 There's been an error, *errorcodeptr is non-zero + +1 Success, this branch must match at least one character + -1 Success, this branch may match an empty string +*/ + +static int +compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr, + PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr, + uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr, + uint32_t *reqcuflagsptr, branch_chain *bcptr, compile_block *cb, + PCRE2_SIZE *lengthptr) +{ +int bravalue = 0; +int okreturn = -1; +int group_return = 0; +uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */ +uint32_t greedy_default, greedy_non_default; +uint32_t repeat_type, op_type; +uint32_t options = *optionsptr; /* May change dynamically */ +uint32_t xoptions = *xoptionsptr; /* May change dynamically */ +uint32_t firstcu, reqcu; +uint32_t zeroreqcu, zerofirstcu; +uint32_t escape; +uint32_t *pptr = *pptrptr; +uint32_t meta, meta_arg; +uint32_t firstcuflags, reqcuflags; +uint32_t zeroreqcuflags, zerofirstcuflags; +uint32_t req_caseopt, reqvary, tempreqvary; +PCRE2_SIZE offset = 0; +PCRE2_SIZE length_prevgroup = 0; +PCRE2_UCHAR *code = *codeptr; +PCRE2_UCHAR *last_code = code; +PCRE2_UCHAR *orig_code = code; +PCRE2_UCHAR *tempcode; +PCRE2_UCHAR *previous = NULL; +PCRE2_UCHAR op_previous; +BOOL groupsetfirstcu = FALSE; +BOOL had_accept = FALSE; +BOOL matched_char = FALSE; +BOOL previous_matched_char = FALSE; +BOOL reset_caseful = FALSE; +const uint8_t *cbits = cb->cbits; +uint8_t classbits[32]; + +/* We can fish out the UTF setting once and for all into a BOOL, but we must +not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically +as we process the pattern. 
*/ + +#ifdef SUPPORT_UNICODE +BOOL utf = (options & PCRE2_UTF) != 0; +BOOL ucp = (options & PCRE2_UCP) != 0; +#else /* No Unicode support */ +BOOL utf = FALSE; +#endif + +/* Helper variables for OP_XCLASS opcode (for characters > 255). We define +class_uchardata always so that it can be passed to add_to_class() always, +though it will not be used in non-UTF 8-bit cases. This avoids having to supply +alternative calls for the different cases. */ + +PCRE2_UCHAR *class_uchardata; +#ifdef SUPPORT_WIDE_CHARS +BOOL xclass; +PCRE2_UCHAR *class_uchardata_base; +#endif + +/* Set up the default and non-default settings for greediness */ + +greedy_default = ((options & PCRE2_UNGREEDY) != 0); +greedy_non_default = greedy_default ^ 1; + +/* Initialize no first unit, no required unit. REQ_UNSET means "no char +matching encountered yet". It gets changed to REQ_NONE if we hit something that +matches a non-fixed first unit; reqcu just remains unset if we never find one. + +When we hit a repeat whose minimum is zero, we may have to adjust these values +to take the zero repeat into account. This is implemented by setting them to +zerofirstcu and zeroreqcu when such a repeat is encountered. The individual +item types that can be repeated set these backoff variables appropriately. */ + +firstcu = reqcu = zerofirstcu = zeroreqcu = 0; +firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; + +/* The variable req_caseopt contains either the REQ_CASELESS bit or zero, +according to the current setting of the caseless flag. The REQ_CASELESS value +leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables +to record the case status of the value. This is used only for ASCII characters. +*/ + +req_caseopt = ((options & PCRE2_CASELESS) != 0)? 
REQ_CASELESS : 0; + +/* Switch on next META item until the end of the branch */ + +for (;; pptr++) + { +#ifdef SUPPORT_WIDE_CHARS + BOOL xclass_has_prop; +#endif + BOOL negate_class; + BOOL should_flip_negation; + BOOL match_all_or_no_wide_chars; + BOOL possessive_quantifier; + BOOL note_group_empty; + int class_has_8bitchar; + uint32_t mclength; + uint32_t skipunits; + uint32_t subreqcu, subfirstcu; + uint32_t groupnumber; + uint32_t verbarglen, verbculen; + uint32_t subreqcuflags, subfirstcuflags; + open_capitem *oc; + PCRE2_UCHAR mcbuffer[8]; + + /* Get next META item in the pattern and its potential argument. */ + + meta = META_CODE(*pptr); + meta_arg = META_DATA(*pptr); + + /* If we are in the pre-compile phase, accumulate the length used for the + previous cycle of this loop, unless the next item is a quantifier. */ + + if (lengthptr != NULL) + { + if (code > cb->start_workspace + cb->workspace_size - + WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ + { + *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? + ERR52 : ERR86; + return 0; + } + + /* There is at least one situation where code goes backwards: this is the + case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier + is processed, the whole class is eliminated. However, it is created first, + so we have to allow memory for it. Therefore, don't ever reduce the length + at this point. */ + + if (code < last_code) code = last_code; + + /* If the next thing is not a quantifier, we add the length of the previous + item into the total, and reset the code pointer to the start of the + workspace. Otherwise leave the previous item available to be quantified. 
*/ + + if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) + { + if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code)) + { + *errorcodeptr = ERR20; /* Integer overflow */ + return 0; + } + *lengthptr += (PCRE2_SIZE)(code - orig_code); + if (*lengthptr > MAX_PATTERN_SIZE) + { + *errorcodeptr = ERR20; /* Pattern is too large */ + return 0; + } + code = orig_code; + } + + /* Remember where this code item starts so we can catch the "backwards" + case above next time round. */ + + last_code = code; + } + + /* Process the next parsed pattern item. If it is not a quantifier, remember + where it starts so that it can be quantified when a quantifier follows. + Checking for the legality of quantifiers happens in parse_regex(), except for + a quantifier after an assertion that is a condition. */ + + if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) + { + previous = code; + if (matched_char && !had_accept) okreturn = 1; + } + + previous_matched_char = matched_char; + matched_char = FALSE; + note_group_empty = FALSE; + skipunits = 0; /* Default value for most subgroups */ + + switch(meta) + { + /* ===================================================================*/ + /* The branch terminates at pattern end or | or ) */ + + case META_END: + case META_ALT: + case META_KET: + *firstcuptr = firstcu; + *firstcuflagsptr = firstcuflags; + *reqcuptr = reqcu; + *reqcuflagsptr = reqcuflags; + *codeptr = code; + *pptrptr = pptr; + return okreturn; + + + /* ===================================================================*/ + /* Handle single-character metacharacters. In multiline mode, ^ disables + the setting of any following char as a first character. */ + + case META_CIRCUMFLEX: + if ((options & PCRE2_MULTILINE) != 0) + { + if (firstcuflags == REQ_UNSET) + zerofirstcuflags = firstcuflags = REQ_NONE; + *code++ = OP_CIRCM; + } + else *code++ = OP_CIRC; + break; + + case META_DOLLAR: + *code++ = ((options & PCRE2_MULTILINE) != 0)? 
OP_DOLLM : OP_DOLL; + break; + + /* There can never be a first char if '.' is first, whatever happens about + repeats. The value of reqcu doesn't change either. */ + + case META_DOT: + matched_char = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; + break; + + + /* ===================================================================*/ + /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. + Otherwise, an initial ']' is taken as a data character. When empty classes + are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must + match any character, so generate OP_ALLANY. */ + + case META_CLASS_EMPTY: + case META_CLASS_EMPTY_NOT: + matched_char = TRUE; + *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + break; + + + /* ===================================================================*/ + /* Non-empty character class. If the included characters are all < 256, we + build a 32-byte bitmap of the permitted characters, except in the special + case where there is only one such character. For negated classes, we build + the map as usual, then invert it at the end. However, we use a different + opcode so that data characters > 255 can be handled correctly. + + If the class contains characters outside the 0-255 range, a different + opcode is compiled. It may optionally have a bit map for characters < 256, + but those above are are explicitly listed afterwards. A flag code unit + tells whether the bitmap is present, and whether this is a negated class or + not. 
*/ + + case META_CLASS_NOT: + case META_CLASS: + matched_char = TRUE; + negate_class = meta == META_CLASS_NOT; + + /* We can optimize the case of a single character in a class by generating + OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's + negative. In the negative case there can be no first char if this item is + first, whatever repeat count may follow. In the case of reqcu, save the + previous value for reinstating. */ + + /* NOTE: at present this optimization is not effective if the only + character in a class in 32-bit, non-UCP mode has its top bit set. */ + + if (pptr[1] < META_END && pptr[2] == META_CLASS_END) + { +#ifdef SUPPORT_UNICODE + uint32_t d; +#endif + uint32_t c = pptr[1]; + + pptr += 2; /* Move on to class end */ + if (meta == META_CLASS) /* A positive one-char class can be */ + { /* handled as a normal literal character. */ + meta = c; /* Set up the character */ + goto NORMAL_CHAR_SET; + } + + /* Handle a negative one-character class */ + + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + + /* For caseless UTF or UCP mode, check whether this character has more + than one other case. If so, generate a special OP_NOTPROP item instead of + OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any + caseless set that starts with an ASCII character. */ + +#ifdef SUPPORT_UNICODE + if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 && + (d = UCD_CASESET(c)) != 0 && + ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 || + PRIV(ucd_caseless_sets)[d] > 127)) + { + *code++ = OP_NOTPROP; + *code++ = PT_CLIST; + *code++ = d; + break; /* We are finished with this class */ + } +#endif + /* Char has only one other (usable) case, or UCP not available */ + + *code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_NOTI: OP_NOT; + code += PUTCHAR(c, code); + break; /* We are finished with this class */ + } /* End of 1-char optimization */ + + /* Handle character classes that contain more than just one literal + character. If there are exactly two characters in a positive class, see if + they are case partners. This can be optimized to generate a caseless single + character match (which also sets first/required code units if relevant). + When casing restrictions apply, ignore a caseless set if both characters + are ASCII. */ + + if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END && + pptr[3] == META_CLASS_END) + { + uint32_t c = pptr[1]; + +#ifdef SUPPORT_UNICODE + if (UCD_CASESET(c) == 0 || + ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 && + c < 128 && pptr[2] < 128)) +#endif + { + uint32_t d; + +#ifdef SUPPORT_UNICODE + if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else +#endif + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) d = c; else +#endif + d = TABLE_GET(c, cb->fcc, c); + } + + if (c != d && pptr[2] == d) + { + pptr += 3; /* Move on to class end */ + meta = c; + if ((options & PCRE2_CASELESS) == 0) + { + reset_caseful = TRUE; + options |= PCRE2_CASELESS; + req_caseopt = REQ_CASELESS; + } + goto CLASS_CASELESS_CHAR; + } + } + } + + /* If a non-extended class contains a negative special such as \S, we need + to flip the negation flag at the end, so that support for characters > 255 + works correctly (they are all included in the class). An extended class may + need to insert specific matching or non-matching code for wide characters. + */ + + should_flip_negation = match_all_or_no_wide_chars = FALSE; + + /* Extended class (xclass) will be used when characters > 255 + might match. 
*/ + +#ifdef SUPPORT_WIDE_CHARS + xclass = FALSE; + class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ + class_uchardata_base = class_uchardata; /* Save the start */ +#endif + + /* For optimization purposes, we track some properties of the class: + class_has_8bitchar will be non-zero if the class contains at least one + character with a code point less than 256; xclass_has_prop will be TRUE if + Unicode property checks are present in the class. */ + + class_has_8bitchar = 0; +#ifdef SUPPORT_WIDE_CHARS + xclass_has_prop = FALSE; +#endif + + /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map + in a temporary bit of memory, in case the class contains fewer than two + 8-bit characters because in that case the compiled code doesn't use the bit + map. */ + + memset(classbits, 0, 32 * sizeof(uint8_t)); + + /* Process items until META_CLASS_END is reached. */ + + while ((meta = *(++pptr)) != META_CLASS_END) + { + /* Handle POSIX classes such as [:alpha:] etc. */ + + if (meta == META_POSIX || meta == META_POSIX_NEG) + { + BOOL local_negate = (meta == META_POSIX_NEG); + int posix_class = *(++pptr); + int taboffset, tabopt; + uint8_t pbits[32]; + + should_flip_negation = local_negate; /* Note negative special */ + + /* If matching is caseless, upper and lower are converted to alpha. + This relies on the fact that the class table starts with alpha, + lower, upper as the first 3 entries. */ + + if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) + posix_class = 0; + + /* When PCRE2_UCP is set, some of the POSIX classes are converted to + different escape sequences that use Unicode properties \p or \P. + Others that are not available via \p or \P have to generate + XCL_PROP/XCL_NOTPROP directly, which is done here. */ + +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UCP) != 0 && + (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) + { + switch(posix_class) + { + case PC_GRAPH: + case PC_PRINT: + case PC_PUNCT: + *class_uchardata++ = local_negate? 
XCL_NOTPROP : XCL_PROP; + *class_uchardata++ = (PCRE2_UCHAR) + ((posix_class == PC_GRAPH)? PT_PXGRAPH : + (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); + *class_uchardata++ = 0; + xclass_has_prop = TRUE; + goto CONTINUE_CLASS; + + /* For the other POSIX classes (ascii, xdigit) we are going to + fall through to the non-UCP case and build a bit map for + characters with code points less than 256. However, if we are in + a negated POSIX class, characters with code points greater than + 255 must either all match or all not match, depending on whether + the whole class is not or is negated. For example, for + [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]... + they must not. + + In the special case where there are no xclass items, this is + automatically handled by the use of OP_CLASS or OP_NCLASS, but an + explicit range is needed for OP_XCLASS. Setting a flag here + causes the range to be generated later when it is known that + OP_XCLASS is required. In the 8-bit library this is relevant only in + utf mode, since no wide characters can exist otherwise. */ + + default: +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) +#endif + match_all_or_no_wide_chars |= local_negate; + break; + } + } +#endif /* SUPPORT_UNICODE */ + + /* In the non-UCP case, or when UCP makes no difference, we build the + bit map for the POSIX class in a chunk of local store because we may + be adding and subtracting from it, and we don't want to subtract bits + that may be in the main map already. At the end we or the result into + the bit map that is being built. */ + + posix_class *= 3; + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uint8_t)); + + /* If there is a second table, add or remove it as required. 
*/ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) + { + if (tabopt >= 0) + for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; + else + for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; + } + + /* Now see if we need to remove any special characters. An option + value of 1 removes vertical space and 2 removes underscore. */ + + if (tabopt < 0) tabopt = -tabopt; + if (tabopt == 1) pbits[1] &= ~0x3c; + else if (tabopt == 2) pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the main table that is + being built and we are done. */ + + if (local_negate) + for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]); + else + for (int i = 0; i < 32; i++) classbits[i] |= pbits[i]; + + /* Every class contains at least one < 256 character. */ + + class_has_8bitchar = 1; + goto CONTINUE_CLASS; /* End of POSIX handling */ + } + + /* Other than POSIX classes, the only items we should encounter are + \d-type escapes and literal characters (possibly as ranges). */ + + if (meta == META_BIGVALUE) + { + meta = *(++pptr); + goto CLASS_LITERAL; + } + + /* Any other non-literal must be an escape */ + + if (meta >= META_END) + { + if (META_CODE(meta) != META_ESCAPE) + { +#ifdef DEBUG_SHOW_PARSED + fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x " + "in character class\n", meta); +#endif + *errorcodeptr = ERR89; /* Internal error - unrecognized. */ + return 0; + } + escape = META_DATA(meta); + + /* Every class contains at least one < 256 character. 
*/ + + class_has_8bitchar++; + + switch(escape) + { + case ESC_d: + for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; + break; + + case ESC_D: + should_flip_negation = TRUE; + for (int i = 0; i < 32; i++) + classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); + break; + + case ESC_w: + for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; + break; + + case ESC_W: + should_flip_negation = TRUE; + for (int i = 0; i < 32; i++) + classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); + break; + + /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl + 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was + previously set by something earlier in the character class. + Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so + we could just adjust the appropriate bit. From PCRE 8.34 we no + longer treat \s and \S specially. */ + + case ESC_s: + for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; + break; + + case ESC_S: + should_flip_negation = TRUE; + for (int i = 0; i < 32; i++) + classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); + break; + + /* When adding the horizontal or vertical space lists to a class, or + their complements, disable PCRE2_CASELESS, because it just wastes + time, and in the "not-x" UTF cases can create unwanted duplicates in + the XCLASS list (provoked by characters that have more than one other + case and by both cases being in the same "not-x" sublist). 
*/ + + case ESC_h: + (void)add_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list), + NOTACHAR); + break; + + case ESC_H: + (void)add_not_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list)); + break; + + case ESC_v: + (void)add_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list), + NOTACHAR); + break; + + case ESC_V: + (void)add_not_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list)); + break; + + /* If Unicode is not supported, \P and \p are not allowed and are + faulted at parse time, so will never appear here. */ + +#ifdef SUPPORT_UNICODE + case ESC_p: + case ESC_P: + { + uint32_t ptype = *(++pptr) >> 16; + uint32_t pdata = *pptr & 0xffff; + *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; + *class_uchardata++ = ptype; + *class_uchardata++ = pdata; + xclass_has_prop = TRUE; + class_has_8bitchar--; /* Undo! */ + } + break; +#endif + } + + goto CONTINUE_CLASS; + } /* End handling \d-type escapes */ + + /* A literal character may be followed by a range meta. At parse time + there are checks for out-of-order characters, for ranges where the two + characters are equal, and for hyphens that cannot indicate a range. At + this point, therefore, no checking is needed. */ + + else + { + uint32_t c, d; + + CLASS_LITERAL: + c = d = meta; + + /* Remember if \r or \n were explicitly used */ + + if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; + + /* Process a character range */ + + if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED) + { +#ifdef EBCDIC + BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL); +#endif + pptr += 2; + d = *pptr; + if (d == META_BIGVALUE) d = *(++pptr); + + /* Remember an explicit \r or \n, and add the range to the class. 
*/ + + if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; + + /* In an EBCDIC environment, Perl treats alphabetic ranges specially + because there are holes in the encoding, and simply using the range + A-Z (for example) would include the characters in the holes. This + applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ + +#ifdef EBCDIC + if (range_is_literal && + (cb->ctypes[c] & ctype_letter) != 0 && + (cb->ctypes[d] & ctype_letter) != 0 && + (c <= CHAR_z) == (d <= CHAR_z)) + { + uint32_t uc = (d <= CHAR_z)? 0 : 64; + uint32_t C = c - uc; + uint32_t D = d - uc; + + if (C <= CHAR_i) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, xoptions, + cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc); + C = CHAR_j; + } + + if (C <= D && C <= CHAR_r) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, xoptions, + cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc); + C = CHAR_s; + } + + if (C <= D) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, xoptions, + cb, C + uc, D + uc); + } + } + else +#endif + /* Not an EBCDIC special range */ + + class_has_8bitchar += add_to_class(classbits, &class_uchardata, + options, xoptions, cb, c, d); + goto CONTINUE_CLASS; /* Go get the next char in the class */ + } /* End of range handling */ + + + /* Handle a single character. */ + + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, xoptions, cb, + meta, meta); + } + + /* Continue to the next item in the class. */ + + CONTINUE_CLASS: + +#ifdef SUPPORT_WIDE_CHARS + /* If any wide characters or Unicode properties have been encountered, + set xclass = TRUE. Then, in the pre-compile phase, accumulate the length + of the extra data and reset the pointer. This is so that very large + classes that contain a zillion wide characters or Unicode property tests + do not overwrite the workspace (which is on the stack). 
*/ + + if (class_uchardata > class_uchardata_base) + { + xclass = TRUE; + if (lengthptr != NULL) + { + *lengthptr += class_uchardata - class_uchardata_base; + class_uchardata = class_uchardata_base; + } + } +#endif + + continue; /* Needed to avoid error when not supporting wide chars */ + } /* End of main class-processing loop */ + + /* If this class is the first thing in the branch, there can be no first + char setting, whatever the repeat count. Any reqcu setting must remain + unchanged after any kind of repeat. */ + + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If there are characters with values > 255, or Unicode property settings + (\p or \P), we have to compile an extended class, with its own opcode, + unless there were no property settings and there was a negated special such + as \S in the class, and PCRE2_UCP is not set, because in that case all + characters > 255 are in or not in the class, so any that were explicitly + given as well can be ignored. + + In the UCP case, if certain negated POSIX classes ([:^ascii:] or + [:^xdigit:]) were present in a class, we either have to match or not match + all wide characters (depending on whether the whole class is or is not + negated). This requirement is indicated by match_all_or_no_wide_chars being + true. We do this by including an explicit range, which works in both cases. + This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there + cannot be any wide characters in 8-bit non-UTF mode. + + When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit + class where \S etc is present without PCRE2_UCP, causing an extended class + to be compiled, we make sure that all characters > 255 are included by + forcing match_all_or_no_wide_chars to be true. 
+ + If, when generating an xclass, there are no characters < 256, we can omit + the bitmap in the actual compiled code. */ + +#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ + if (xclass && ( +#ifdef SUPPORT_UNICODE + (options & PCRE2_UCP) != 0 || +#endif + xclass_has_prop || !should_flip_negation)) + { + if (match_all_or_no_wide_chars || ( +#if PCRE2_CODE_UNIT_WIDTH == 8 + utf && +#endif + should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) + { + *class_uchardata++ = XCL_RANGE; + if (utf) /* Will always be utf in the 8-bit library */ + { + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); + } + else /* Can only happen for the 16-bit & 32-bit libraries */ + { +#if PCRE2_CODE_UNIT_WIDTH == 16 + *class_uchardata++ = 0x100; + *class_uchardata++ = 0xffffu; +#elif PCRE2_CODE_UNIT_WIDTH == 32 + *class_uchardata++ = 0x100; + *class_uchardata++ = 0xffffffffu; +#endif + } + } + *class_uchardata++ = XCL_END; /* Marks the end of extra data */ + *code++ = OP_XCLASS; + code += LINK_SIZE; + *code = negate_class? XCL_NOT:0; + if (xclass_has_prop) *code |= XCL_HASPROP; + + /* If the map is required, move up the extra data to make room for it; + otherwise just move the code pointer to the end of the extra data. */ + + if (class_has_8bitchar > 0) + { + *code++ |= XCL_MAP; + (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, + CU2BYTES(class_uchardata - code)); + if (negate_class && !xclass_has_prop) + { + /* Using 255 ^ instead of ~ avoids clang sanitize warning. 
*/ + for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; + } + memcpy(code, classbits, 32); + code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); + } + else code = class_uchardata; + + /* Now fill in the complete length of the item */ + + PUT(previous, 1, (int)(code - previous)); + break; /* End of class handling */ + } +#endif /* SUPPORT_WIDE_CHARS */ + + /* If there are no characters > 255, or they are all to be included or + excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the + whole class was negated and whether there were negative specials such as \S + (non-UCP) in the class. Then copy the 32-byte map into the code vector, + negating it if necessary. */ + + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; + if (lengthptr == NULL) /* Save time in the pre-compile phase */ + { + if (negate_class) + { + /* Using 255 ^ instead of ~ avoids clang sanitize warning. */ + for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i]; + } + memcpy(code, classbits, 32); + } + code += 32 / sizeof(PCRE2_UCHAR); + break; /* End of class processing */ + + + /* ===================================================================*/ + /* Deal with (*VERB)s. */ + + /* Check for open captures before ACCEPT and close those that are within + the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an + assertion. In the first pass, just accumulate the length required; + otherwise hitting (*ACCEPT) inside many nested parentheses can cause + workspace overflow. Do not set firstcu after *ACCEPT. */ + + case META_ACCEPT: + cb->had_accept = had_accept = TRUE; + for (oc = cb->open_caps; + oc != NULL && oc->assert_depth >= cb->assert_depth; + oc = oc->next) + { + if (lengthptr != NULL) + { + *lengthptr += CU2BYTES(1) + IMM2_SIZE; + } + else + { + *code++ = OP_CLOSE; + PUT2INC(code, 0, oc->number); + } + } + *code++ = (cb->assert_depth > 0)? 
OP_ASSERT_ACCEPT : OP_ACCEPT; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + break; + + case META_PRUNE: + case META_SKIP: + cb->had_pruneorskip = TRUE; + /* Fall through */ + case META_COMMIT: + case META_FAIL: + *code++ = verbops[(meta - META_MARK) >> 16]; + break; + + case META_THEN: + cb->external_flags |= PCRE2_HASTHEN; + *code++ = OP_THEN; + break; + + /* Handle verbs with arguments. Arguments can be very long, especially in + 16- and 32-bit modes, and can overflow the workspace in the first pass. + However, the argument length is constrained to be small enough to fit in + one code unit. This check happens in parse_regex(). In the first pass, + instead of putting the argument into memory, we just update the length + counter and set up an empty argument. */ + + case META_THEN_ARG: + cb->external_flags |= PCRE2_HASTHEN; + goto VERB_ARG; + + case META_PRUNE_ARG: + case META_SKIP_ARG: + cb->had_pruneorskip = TRUE; + /* Fall through */ + case META_MARK: + case META_COMMIT_ARG: + VERB_ARG: + *code++ = verbops[(meta - META_MARK) >> 16]; + /* The length is in characters. */ + verbarglen = *(++pptr); + verbculen = 0; + tempcode = code++; + for (int i = 0; i < (int)verbarglen; i++) + { + meta = *(++pptr); +#ifdef SUPPORT_UNICODE + if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else +#endif + { + mclength = 1; + mcbuffer[0] = meta; + } + if (lengthptr != NULL) *lengthptr += mclength; else + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + verbculen += mclength; + } + } + + *tempcode = verbculen; /* Fill in the code unit length */ + *code++ = 0; /* Terminating zero */ + break; + + + /* ===================================================================*/ + /* Handle options change. The new setting must be passed back for use in + subsequent branches. Reset the greedy defaults and the case value for + firstcu and reqcu. 
*/ + + case META_OPTIONS: + *optionsptr = options = *(++pptr); + *xoptionsptr = xoptions = *(++pptr); + greedy_default = ((options & PCRE2_UNGREEDY) != 0); + greedy_non_default = greedy_default ^ 1; + req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; + break; + + + /* ===================================================================*/ + /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous + because it could be a numerical check on recursion, or a name check on a + group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that + we can handle it either way. We first try for a name; if not found, process + the number. */ + + case META_COND_RNUMBER: /* (?(Rdigits) */ + case META_COND_NAME: /* (?(name) or (?'name') or ?() */ + case META_COND_RNAME: /* (?(R&name) - test for recursion */ + bravalue = OP_COND; + { + int count, index; + unsigned int i; + PCRE2_SPTR name; + named_group *ng = cb->named_groups; + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + + /* In the first pass, the names generated in the pre-pass are available, + but the main name table has not yet been created. Scan the list of names + generated in the pre-pass in order to get a number and whether or not + this name is duplicated. If it is not duplicated, we can handle it as a + numerical group. */ + + for (i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && + PRIV(strncmp)(name, ng->name, length) == 0) + { + if (!ng->isdup) + { + code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; + PUT2(code, 2+LINK_SIZE, ng->number); + if (ng->number > cb->top_backref) cb->top_backref = ng->number; + skipunits = 1+IMM2_SIZE; + goto GROUP_PROCESS_NOTE_EMPTY; + } + break; /* Found a duplicated name */ + } + } + + /* If the name was not found we have a bad reference, unless we are + dealing with R, which is treated as a recursion test by number. 
+ */ + + if (i >= cb->names_found) + { + groupnumber = 0; + if (meta == META_COND_RNUMBER) + { + for (i = 1; i < length; i++) + { + groupnumber = groupnumber * 10 + name[i] - CHAR_0; + if (groupnumber > MAX_GROUP_NUMBER) + { + *errorcodeptr = ERR61; + cb->erroroffset = offset + i; + return 0; + } + } + } + + if (meta != META_COND_RNUMBER || groupnumber > cb->bracount) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + + /* (?Rdigits) treated as a recursion reference by number. A value of + zero (which is the result of both (?R) and (?R0)) means "any", and is + translated into RREF_ANY (which is 0xffff). */ + + if (groupnumber == 0) groupnumber = RREF_ANY; + code[1+LINK_SIZE] = OP_RREF; + PUT2(code, 2+LINK_SIZE, groupnumber); + skipunits = 1+IMM2_SIZE; + goto GROUP_PROCESS_NOTE_EMPTY; + } + + /* A duplicated name was found. Note that if an R name is found + (META_COND_RNUMBER), it is a reference test, not a recursion test. */ + + code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; + + /* We have a duplicated name. In the compile pass we have to search the + main table in order to get the index and count values. */ + + count = 0; /* Values for first pass (avoids compiler warning) */ + index = 0; + if (lengthptr == NULL && !find_dupname_details(name, length, &index, + &count, errorcodeptr, cb)) return 0; + + /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and + insert appropriate data values. */ + + code[1+LINK_SIZE]++; + skipunits = 1+2*IMM2_SIZE; + PUT2(code, 2+LINK_SIZE, index); + PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); + } + goto GROUP_PROCESS_NOTE_EMPTY; + + /* The DEFINE condition is always false. Its internal groups may never + be called, so matched_char must remain false, hence the jump to + GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. 
*/ + + case META_COND_DEFINE: + bravalue = OP_COND; + GETPLUSOFFSET(offset, pptr); + code[1+LINK_SIZE] = OP_DEFINE; + skipunits = 1; + goto GROUP_PROCESS; + + /* Conditional test of a group's being set. */ + + case META_COND_NUMBER: + bravalue = OP_COND; + GETPLUSOFFSET(offset, pptr); + groupnumber = *(++pptr); + if (groupnumber > cb->bracount) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; + offset -= 2; /* Point at initial ( for too many branches error */ + code[1+LINK_SIZE] = OP_CREF; + skipunits = 1+IMM2_SIZE; + PUT2(code, 2+LINK_SIZE, groupnumber); + goto GROUP_PROCESS_NOTE_EMPTY; + + /* Test for the PCRE2 version. */ + + case META_COND_VERSION: + bravalue = OP_COND; + if (pptr[1] > 0) + code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) || + (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))? + OP_TRUE : OP_FALSE; + else + code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])? + OP_TRUE : OP_FALSE; + skipunits = 1; + pptr += 3; + goto GROUP_PROCESS_NOTE_EMPTY; + + /* The condition is an assertion, possibly preceded by a callout. */ + + case META_COND_ASSERT: + bravalue = OP_COND; + goto GROUP_PROCESS_NOTE_EMPTY; + + + /* ===================================================================*/ + /* Handle all kinds of nested bracketed groups. The non-capturing, + non-conditional cases are here; others come to GROUP_PROCESS via goto. */ + + case META_LOOKAHEAD: + bravalue = OP_ASSERT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_LOOKAHEAD_NA: + bravalue = OP_ASSERT_NA; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird + thing to do, but Perl allows all assertions to be quantified, and when + they contain capturing parentheses there may be a potential use for + this feature. Not that that applies to a quantified (?!) but we allow + it for uniformity. 
*/ + + case META_LOOKAHEADNOT: + if (pptr[1] == META_KET && + (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY)) + { + *code++ = OP_FAIL; + pptr++; + } + else + { + bravalue = OP_ASSERT_NOT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + } + break; + + case META_LOOKBEHIND: + bravalue = OP_ASSERTBACK; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_LOOKBEHINDNOT: + bravalue = OP_ASSERTBACK_NOT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_LOOKBEHIND_NA: + bravalue = OP_ASSERTBACK_NA; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_ATOMIC: + bravalue = OP_ONCE; + goto GROUP_PROCESS_NOTE_EMPTY; + + case META_SCRIPT_RUN: + bravalue = OP_SCRIPT_RUN; + goto GROUP_PROCESS_NOTE_EMPTY; + + case META_NOCAPTURE: + bravalue = OP_BRA; + /* Fall through */ + + /* Process nested bracketed regex. The nesting depth is maintained for the + benefit of the stackguard function. The test for too deep nesting is now + done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; + others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take + note of whether or not they may match an empty string. */ + + GROUP_PROCESS_NOTE_EMPTY: + note_group_empty = TRUE; + + GROUP_PROCESS: + cb->parens_depth += 1; + *code = bravalue; + pptr++; + tempcode = code; + tempreqvary = cb->req_varyopt; /* Save value before group */ + length_prevgroup = 0; /* Initialize for pre-compile phase */ + + if ((group_return = + compile_regex( + options, /* The options state */ + xoptions, /* The extra options state */ + &tempcode, /* Where to put code (updated) */ + &pptr, /* Input pointer (updated) */ + errorcodeptr, /* Where to put an error message */ + skipunits, /* Skip over bracket number */ + &subfirstcu, /* For possible first char */ + &subfirstcuflags, + &subreqcu, /* For possible last char */ + &subreqcuflags, + bcptr, /* Current branch chain */ + cb, /* Compile data block */ + (lengthptr == NULL)? 
NULL : /* Actual compile phase */ + &length_prevgroup /* Pre-compile phase */ + )) == 0) + return 0; /* Error */ + + cb->parens_depth -= 1; + + /* If that was a non-conditional significant group (not an assertion, not a + DEFINE) that matches at least one character, then the current item matches + a character. Conditionals are handled below. */ + + if (note_group_empty && bravalue != OP_COND && group_return > 0) + matched_char = TRUE; + + /* If we've just compiled an assertion, pop the assert depth. */ + + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA) + cb->assert_depth -= 1; + + /* At the end of compiling, code is still pointing to the start of the + group, while tempcode has been updated to point past the end of the group. + The parsed pattern pointer (pptr) is on the closing META_KET. + + If this is a conditional bracket, check that there are no more than + two branches in the group, or just one if it's a DEFINE group. We do this + in the real compile phase, not in the pre-pass, where the whole group may + not be available. */ + + if (bravalue == OP_COND && lengthptr == NULL) + { + PCRE2_UCHAR *tc = code; + int condcount = 0; + + do { + condcount++; + tc += GET(tc,1); + } + while (*tc != OP_KET); + + /* A DEFINE group is never obeyed inline (the "condition" is always + false). It must have only one branch. Having checked this, change the + opcode to OP_FALSE. */ + + if (code[LINK_SIZE+1] == OP_DEFINE) + { + if (condcount > 1) + { + cb->erroroffset = offset; + *errorcodeptr = ERR54; + return 0; + } + code[LINK_SIZE+1] = OP_FALSE; + bravalue = OP_DEFINE; /* A flag to suppress char handling below */ + } + + /* A "normal" conditional group. If there is just one branch, we must not + make use of its firstcu or reqcu, because this is equivalent to an + empty second branch. Also, it may match an empty string. If there are two + branches, this item must match a character if the group must. 
*/ + + else + { + if (condcount > 2) + { + cb->erroroffset = offset; + *errorcodeptr = ERR27; + return 0; + } + if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; + else if (group_return > 0) matched_char = TRUE; + } + } + + /* In the pre-compile phase, update the length by the length of the group, + less the brackets at either end. Then reduce the compiled code to just a + set of non-capturing brackets so that it doesn't use much memory if it is + duplicated by a quantifier.*/ + + if (lengthptr != NULL) + { + if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; + code++; /* This already contains bravalue */ + PUTINC(code, 0, 1 + LINK_SIZE); + *code++ = OP_KET; + PUTINC(code, 0, 1 + LINK_SIZE); + break; /* No need to waste time with special character handling */ + } + + /* Otherwise update the main code pointer to the end of the group. */ + + code = tempcode; + + /* For a DEFINE group, required and first character settings are not + relevant. */ + + if (bravalue == OP_DEFINE) break; + + /* Handle updating of the required and first code units for other types of + group. Update for normal brackets of all kinds, and conditions with two + branches (see code above). If the bracket is followed by a quantifier with + zero repeat, we have to back off. Hence the definition of zeroreqcu and + zerofirstcu outside the main loop so that they can be accessed for the back + off. */ + + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + groupsetfirstcu = FALSE; + + if (bravalue >= OP_ONCE) /* Not an assertion */ + { + /* If we have not yet set a firstcu in this branch, take it from the + subpattern, remembering that it was set here so that a repeat of more + than one can replicate it as reqcu if necessary. If the subpattern has + no firstcu, set "none" for the whole branch. 
In both cases, a zero + repeat forces firstcu to "none". */ + + if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) + { + if (subfirstcuflags < REQ_NONE) + { + firstcu = subfirstcu; + firstcuflags = subfirstcuflags; + groupsetfirstcu = TRUE; + } + else firstcuflags = REQ_NONE; + zerofirstcuflags = REQ_NONE; + } + + /* If firstcu was previously set, convert the subpattern's firstcu + into reqcu if there wasn't one, using the vary flag that was in + existence beforehand. */ + + else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE) + { + subreqcu = subfirstcu; + subreqcuflags = subfirstcuflags | tempreqvary; + } + + /* If the subpattern set a required code unit (or set a first code unit + that isn't really the first code unit - see above), set it. */ + + if (subreqcuflags < REQ_NONE) + { + reqcu = subreqcu; + reqcuflags = subreqcuflags; + } + } + + /* For a forward assertion, we take the reqcu, if set, provided that the + group has also set a firstcu. This can be helpful if the pattern that + follows the assertion doesn't set a different char. For example, it's + useful for /(?=abcde).+/. We can't set firstcu for an assertion, however + because it leads to incorrect effect for patterns such as /(?=a)a.+/ when + the "real" "a" would then become a reqcu instead of a firstcu. This is + overcome by a scan at the end if there's no firstcu, looking for an + asserted first char. A similar effect for patterns like /(?=.*X)X$/ means + we must only take the reqcu when the group also set a firstcu. Otherwise, + in that example, 'X' ends up set for both. */ + + else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) && + subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE) + { + reqcu = subreqcu; + reqcuflags = subreqcuflags; + } + + break; /* End of nested group handling */ + + + /* ===================================================================*/ + /* Handle named backreferences and recursions. 
*/ + + case META_BACKREF_BYNAME: + case META_RECURSE_BYNAME: + { + int count, index; + PCRE2_SPTR name; + BOOL is_dupname = FALSE; + named_group *ng = cb->named_groups; + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + + /* In the first pass, the names generated in the pre-pass are available, + but the main name table has not yet been created. Scan the list of names + generated in the pre-pass in order to get a number and whether or not + this name is duplicated. */ + + groupnumber = 0; + for (unsigned int i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && + PRIV(strncmp)(name, ng->name, length) == 0) + { + is_dupname = ng->isdup; + groupnumber = ng->number; + + /* For a recursion, that's all that is needed. We can now go to + the code that handles numerical recursion, applying it to the first + group with the given name. */ + + if (meta == META_RECURSE_BYNAME) + { + meta_arg = groupnumber; + goto HANDLE_NUMERICAL_RECURSION; + } + + /* For a back reference, update the back reference map and the + maximum back reference. */ + + cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; + if (groupnumber > cb->top_backref) + cb->top_backref = groupnumber; + } + } + + /* If the name was not found we have a bad reference. */ + + if (groupnumber == 0) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + + /* If a back reference name is not duplicated, we can handle it as + a numerical reference. */ + + if (!is_dupname) + { + meta_arg = groupnumber; + goto HANDLE_SINGLE_REFERENCE; + } + + /* If a back reference name is duplicated, we generate a different + opcode to a numerical back reference. In the second pass we must + search for the index and count in the final name table. 
*/ + + count = 0; /* Values for first pass (avoids compiler warning) */ + index = 0; + if (lengthptr == NULL && !find_dupname_details(name, length, &index, + &count, errorcodeptr, cb)) return 0; + + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; + PUT2INC(code, 0, index); + PUT2INC(code, 0, count); + } + break; + + + /* ===================================================================*/ + /* Handle a numerical callout. */ + + case META_CALLOUT_NUMBER: + code[0] = OP_CALLOUT; + PUT(code, 1, pptr[1]); /* Offset to next pattern item */ + PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ + code[1 + 2*LINK_SIZE] = pptr[3]; + pptr += 3; + code += PRIV(OP_lengths)[OP_CALLOUT]; + break; + + + /* ===================================================================*/ + /* Handle a callout with a string argument. In the pre-pass we just compute + the length without generating anything. The length in pptr[3] includes both + delimiters; in the actual compile only the first one is copied, but a + terminating zero is added. Any doubled delimiters within the string make + this an overestimate, but it is not worth bothering about. */ + + case META_CALLOUT_STRING: + if (lengthptr != NULL) + { + *lengthptr += pptr[3] + (1 + 4*LINK_SIZE); + pptr += 3; + SKIPOFFSET(pptr); + } + + /* In the real compile we can copy the string. The starting delimiter is + included so that the client can discover it if they want. We also pass the + start offset to help a script language give better error messages. 
*/ + + else + { + PCRE2_SPTR pp; + uint32_t delimiter; + uint32_t length = pptr[3]; + PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); + + code[0] = OP_CALLOUT_STR; + PUT(code, 1, pptr[1]); /* Offset to next pattern item */ + PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ + + pptr += 3; + GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */ + pp = cb->start_pattern + offset; + delimiter = *callout_string++ = *pp++; + if (delimiter == CHAR_LEFT_CURLY_BRACKET) + delimiter = CHAR_RIGHT_CURLY_BRACKET; + PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */ + + /* The syntax of the pattern was checked in the parsing scan. The length + includes both delimiters, but we have passed the opening one just above, + so we reduce length before testing it. The test is for > 1 because we do + not want to copy the final delimiter. This also ensures that pp[1] is + accessible. */ + + while (--length > 1) + { + if (*pp == delimiter && pp[1] == delimiter) + { + *callout_string++ = delimiter; + pp += 2; + length--; + } + else *callout_string++ = *pp++; + } + *callout_string++ = CHAR_NUL; + + /* Set the length of the entire item, the advance to its end. */ + + PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code)); + code = callout_string; + } + break; + + + /* ===================================================================*/ + /* Handle repetition. The different types are all sorted out in the parsing + pass. 
*/ + + case META_MINMAX_PLUS: + case META_MINMAX_QUERY: + case META_MINMAX: + repeat_min = *(++pptr); + repeat_max = *(++pptr); + goto REPEAT; + + case META_ASTERISK: + case META_ASTERISK_PLUS: + case META_ASTERISK_QUERY: + repeat_min = 0; + repeat_max = REPEAT_UNLIMITED; + goto REPEAT; + + case META_PLUS: + case META_PLUS_PLUS: + case META_PLUS_QUERY: + repeat_min = 1; + repeat_max = REPEAT_UNLIMITED; + goto REPEAT; + + case META_QUERY: + case META_QUERY_PLUS: + case META_QUERY_QUERY: + repeat_min = 0; + repeat_max = 1; + + REPEAT: + if (previous_matched_char && repeat_min > 0) matched_char = TRUE; + + /* Remember whether this is a variable length repeat, and default to + single-char opcodes. */ + + reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; + op_type = 0; + + /* Adjust first and required code units for a zero repeat. */ + + if (repeat_min == 0) + { + firstcu = zerofirstcu; + firstcuflags = zerofirstcuflags; + reqcu = zeroreqcu; + reqcuflags = zeroreqcuflags; + } + + /* Note the greediness and possessiveness. */ + + switch (meta) + { + case META_MINMAX_PLUS: + case META_ASTERISK_PLUS: + case META_PLUS_PLUS: + case META_QUERY_PLUS: + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + break; + + case META_MINMAX_QUERY: + case META_ASTERISK_QUERY: + case META_PLUS_QUERY: + case META_QUERY_QUERY: + repeat_type = greedy_non_default; + possessive_quantifier = FALSE; + break; + + default: + repeat_type = greedy_default; + possessive_quantifier = FALSE; + break; + } + + /* Save start of previous item, in case we have to move it up in order to + insert something before it, and remember what it was. */ + + tempcode = previous; + op_previous = *previous; + + /* Now handle repetition for the different types of item. If the repeat + minimum and the repeat maximum are both 1, we can ignore the quantifier for + non-parenthesized items, as they have only one alternative. For anything in + parentheses, we must not ignore if {1} is possessive. 
*/ + + switch (op_previous) + { + /* If previous was a character or negated character match, abolish the + item and generate a repeat item instead. If a char item has a minimum of + more than one, ensure that it is set in reqcu - it might not be if a + sequence such as x{3} is the first thing in a branch because the x will + have gone into firstcu instead. */ + + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; + op_type = chartypeoffset[op_previous - OP_CHAR]; + + /* Deal with UTF characters that take up more than one code unit. */ + +#ifdef MAYBE_UTF_MULTI + if (utf && NOT_FIRSTCU(code[-1])) + { + PCRE2_UCHAR *lastchar = code - 1; + BACKCHAR(lastchar); + mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ + memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ + } + else +#endif /* MAYBE_UTF_MULTI */ + + /* Handle the case of a single code unit - either with no UTF support, or + with UTF disabled, or for a single-code-unit UTF character. In the latter + case, for a repeated positive match, get the caseless flag for the + required code unit from the previous character, because a class like [Aa] + sets a caseless A but by now the req_caseopt flag has been reset. */ + + { + mcbuffer[0] = code[-1]; + mclength = 1; + if (op_previous <= OP_CHARI && repeat_min > 1) + { + reqcu = mcbuffer[0]; + reqcuflags = cb->req_varyopt; + if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS; + } + } + goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ + + /* If previous was a character class or a back reference, we put the + repeat stuff after it, but just skip the item if the repeat was {0,0}. 
*/ + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: +#endif + case OP_CLASS: + case OP_NCLASS: + case OP_REF: + case OP_REFI: + case OP_DNREF: + case OP_DNREFI: + + if (repeat_max == 0) + { + code = previous; + goto END_REPEAT; + } + if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; + + if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED) + *code++ = OP_CRSTAR + repeat_type; + else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED) + *code++ = OP_CRPLUS + repeat_type; + else if (repeat_min == 0 && repeat_max == 1) + *code++ = OP_CRQUERY + repeat_type; + else + { + *code++ = OP_CRRANGE + repeat_type; + PUT2INC(code, 0, repeat_min); + if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */ + PUT2INC(code, 0, repeat_max); + } + break; + + /* If previous is OP_FAIL, it was generated by an empty class [] + (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be + generated, that is by (*FAIL) or (?!), disallow a quantifier at parse + time. We can just ignore this repeat. */ + + case OP_FAIL: + goto END_REPEAT; + + /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets + because pcre2_match() could not handle backtracking into recursively + called groups. Now that this backtracking is available, we no longer need + to do this. However, we still need to replicate recursions as we do for + groups so as to have independent backtracking points. We can replicate + for the minimum number of repeats directly. For optional repeats we now + wrap the recursion in OP_BRA brackets and make use of the bracket + repetition. */ + + case OP_RECURSE: + if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) + goto END_REPEAT; + + /* Generate unwrapped repeats for a non-zero minimum, except when the + minimum is 1 and the maximum unlimited, because that can be handled with + OP_BRA terminated by OP_KETRMAX/MIN. 
When the maximum is equal to the + minimum, we just need to generate the appropriate additional copies. + Otherwise we need to generate one more, to simulate the situation when + the minimum is zero. */ + + if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED)) + { + int replicate = repeat_min; + if (repeat_min == repeat_max) replicate--; + + /* In the pre-compile phase, we don't actually do the replication. We + just adjust the length as if we had. Do some paranoid checks for + potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit + integer type when available, otherwise double. */ + + if (lengthptr != NULL) + { + PCRE2_SIZE delta = replicate*(1 + LINK_SIZE); + if ((INT64_OR_DOUBLE)replicate* + (INT64_OR_DOUBLE)(1 + LINK_SIZE) > + (INT64_OR_DOUBLE)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += delta; + } + + else for (int i = 0; i < replicate; i++) + { + memcpy(code, previous, CU2BYTES(1 + LINK_SIZE)); + previous = code; + code += 1 + LINK_SIZE; + } + + /* If the number of repeats is fixed, we are done. Otherwise, adjust + the counts and fall through. */ + + if (repeat_min == repeat_max) break; + if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; + repeat_min = 0; + } + + /* Wrap the recursion call in OP_BRA brackets. */ + + (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); + op_previous = *previous = OP_BRA; + PUT(previous, 1, 2 + 2*LINK_SIZE); + previous[2 + 2*LINK_SIZE] = OP_KET; + PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); + code += 2 + 2 * LINK_SIZE; + length_prevgroup = 3 + 3*LINK_SIZE; + group_return = -1; /* Set "may match empty string" */ + + /* Now treat as a repeated OP_BRA. */ + /* Fall through */ + + /* If previous was a bracket group, we may have to replicate it in + certain cases. 
Note that at this point we can encounter only the "basic" + bracket opcodes such as BRA and CBRA, as this is the place where they get + converted into the more special varieties such as BRAPOS and SBRA. + Originally, PCRE did not allow repetition of assertions, but now it does, + for Perl compatibility. */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERT_NA: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ASSERTBACK_NA: + case OP_ONCE: + case OP_SCRIPT_RUN: + case OP_BRA: + case OP_CBRA: + case OP_COND: + { + int len = (int)(code - previous); + PCRE2_UCHAR *bralink = NULL; + PCRE2_UCHAR *brazeroptr = NULL; + + if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier) + goto END_REPEAT; + + /* Repeating a DEFINE group (or any group where the condition is always + FALSE and there is only one branch) is pointless, but Perl allows the + syntax, so we just ignore the repeat. */ + + if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && + previous[GET(previous, 1)] != OP_ALT) + goto END_REPEAT; + + /* Perl allows all assertions to be quantified, and when they contain + capturing parentheses and/or are optional there are potential uses for + this feature. PCRE2 used to force the maximum quantifier to 1 on the + invalid grounds that further repetition was never useful. This was + always a bit pointless, since an assertion could be wrapped with a + repeated group to achieve the effect. General repetition is now + permitted, but if the maximum is unlimited it is set to one more than + the minimum. */ + + if (op_previous < OP_ONCE) /* Assertion */ + { + if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1; + } + + /* The case of a zero minimum is special because of the need to stick + OP_BRAZERO in front of it, and because the group appears once in the + data, whereas in other cases it appears the minimum number of times. 
For + this reason, it is simplest to treat this case separately, as otherwise + the code gets far too messy. There are several special subcases when the + minimum is zero. */ + + if (repeat_min == 0) + { + /* If the maximum is also zero, we used to just omit the group from + the output altogether, like this: + + ** if (repeat_max == 0) + ** { + ** code = previous; + ** goto END_REPEAT; + ** } + + However, that fails when a group or a subgroup within it is + referenced as a subroutine from elsewhere in the pattern, so now we + stick in OP_SKIPZERO in front of it so that it is skipped on + execution. As we don't have a list of which groups are referenced, we + cannot do this selectively. + + If the maximum is 1 or unlimited, we just have to stick in the + BRAZERO and do no more at this point. */ + + if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED) + { + (void)memmove(previous + 1, previous, CU2BYTES(len)); + code++; + if (repeat_max == 0) + { + *previous++ = OP_SKIPZERO; + goto END_REPEAT; + } + brazeroptr = previous; /* Save for possessive optimizing */ + *previous++ = OP_BRAZERO + repeat_type; + } + + /* If the maximum is greater than 1 and limited, we have to replicate + in a nested fashion, sticking OP_BRAZERO before each set of brackets. + The first one has to be handled carefully because it's the original + copy, which has to be moved up. The remainder can be handled by code + that is common with the non-zero minimum case below. We have to + adjust the value or repeat_max, since one less copy is required. */ + + else + { + int linkoffset; + (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; + *previous++ = OP_BRA; + + /* We chain together the bracket link offset fields that have to be + filled in later when the ends of the brackets are reached. */ + + linkoffset = (bralink == NULL)? 
0 : (int)(previous - bralink); + bralink = previous; + PUTINC(previous, 0, linkoffset); + } + + if (repeat_max != REPEAT_UNLIMITED) repeat_max--; + } + + /* If the minimum is greater than zero, replicate the group as many + times as necessary, and adjust the maximum to the number of subsequent + copies that we need. */ + + else + { + if (repeat_min > 1) + { + /* In the pre-compile phase, we don't actually do the replication. + We just adjust the length as if we had. Do some paranoid checks for + potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit + integer type when available, otherwise double. */ + + if (lengthptr != NULL) + { + PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup; + if ((INT64_OR_DOUBLE)(repeat_min - 1)* + (INT64_OR_DOUBLE)length_prevgroup > + (INT64_OR_DOUBLE)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += delta; + } + + /* This is compiling for real. If there is a set first code unit + for the group, and we have not yet set a "required code unit", set + it. */ + + else + { + if (groupsetfirstcu && reqcuflags >= REQ_NONE) + { + reqcu = firstcu; + reqcuflags = firstcuflags; + } + for (uint32_t i = 1; i < repeat_min; i++) + { + memcpy(code, previous, CU2BYTES(len)); + code += len; + } + } + } + + if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; + } + + /* This code is common to both the zero and non-zero minimum cases. If + the maximum is limited, it replicates the group in a nested fashion, + remembering the bracket starts on a stack. In the case of a zero + minimum, the first one was set up above. In all cases the repeat_max + now specifies the number of additional copies needed. Again, we must + remember to replicate entries on the forward reference list. */ + + if (repeat_max != REPEAT_UNLIMITED) + { + /* In the pre-compile phase, we don't actually do the replication. We + just adjust the length as if we had. 
For each repetition we must add + 1 to the length for BRAZERO and for all but the last repetition we + must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some + paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type + is a 64-bit integer type when available, otherwise double. */ + + if (lengthptr != NULL && repeat_max > 0) + { + PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - + 2 - 2*LINK_SIZE; /* Last one doesn't nest */ + if ((INT64_OR_DOUBLE)repeat_max * + (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) + > (INT64_OR_DOUBLE)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += delta; + } + + /* This is compiling for real */ + + else for (uint32_t i = repeat_max; i >= 1; i--) + { + *code++ = OP_BRAZERO + repeat_type; + + /* All but the final copy start a new nesting, maintaining the + chain of brackets outstanding. */ + + if (i != 1) + { + int linkoffset; + *code++ = OP_BRA; + linkoffset = (bralink == NULL)? 0 : (int)(code - bralink); + bralink = code; + PUTINC(code, 0, linkoffset); + } + + memcpy(code, previous, CU2BYTES(len)); + code += len; + } + + /* Now chain through the pending brackets, and fill in their length + fields (which are holding the chain links pro tem). */ + + while (bralink != NULL) + { + int oldlinkoffset; + int linkoffset = (int)(code - bralink + 1); + PCRE2_UCHAR *bra = code - linkoffset; + oldlinkoffset = GET(bra, 1); + bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; + *code++ = OP_KET; + PUTINC(code, 0, linkoffset); + PUT(bra, 1, linkoffset); + } + } + + /* If the maximum is unlimited, set a repeater in the final copy. For + SCRIPT_RUN and ONCE brackets, that's all we need to do. However, + possessively repeated ONCE brackets can be converted into non-capturing + brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this + saves having to deal with possessive ONCEs specially. 
+ + Otherwise, when we are doing the actual compile phase, check to see + whether this group is one that could match an empty string. If so, + convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so + that runtime checking can be done. [This check is also applied to ONCE + and SCRIPT_RUN groups at runtime, but in a different way.] + + Then, if the quantifier was possessive and the bracket is not a + conditional, we convert the BRA code to the POS form, and the KET code + to KETRPOS. (It turns out to be convenient at runtime to detect this + kind of subpattern at both the start and at the end.) The use of + special opcodes makes it possible to reduce greatly the stack usage in + pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to + OP_BRAPOSZERO. + + Then, if the minimum number of matches is 1 or 0, cancel the possessive + flag so that the default action below, of wrapping everything inside + atomic brackets, does not happen. When the minimum is greater than 1, + there will be earlier copies of the group, and so we still have to wrap + the whole thing. */ + + else + { + PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; + PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); + + /* Convert possessive ONCE brackets to non-capturing */ + + if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; + + /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need + to do is to set the KET. */ + + if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN) + *ketcode = OP_KETRMAX + repeat_type; + + /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs + (which have been converted to non-capturing above). */ + + else + { + /* In the compile phase, adjust the opcode if the group can match + an empty string. For a conditional group with only one branch, the + value of group_return will not show "could be empty", so we must + check that separately. 
*/ + + if (lengthptr == NULL) + { + if (group_return < 0) *bracode += OP_SBRA - OP_BRA; + if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) + *bracode = OP_SCOND; + } + + /* Handle possessive quantifiers. */ + + if (possessive_quantifier) + { + /* For COND brackets, we wrap the whole thing in a possessively + repeated non-capturing bracket, because we have not invented POS + versions of the COND opcodes. */ + + if (*bracode == OP_COND || *bracode == OP_SCOND) + { + int nlen = (int)(code - bracode); + (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); + code += 1 + LINK_SIZE; + nlen += 1 + LINK_SIZE; + *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; + *code++ = OP_KETRPOS; + PUTINC(code, 0, nlen); + PUT(bracode, 1, nlen); + } + + /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ + + else + { + *bracode += 1; /* Switch to xxxPOS opcodes */ + *ketcode = OP_KETRPOS; + } + + /* If the minimum is zero, mark it as possessive, then unset the + possessive flag when the minimum is 0 or 1. */ + + if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; + if (repeat_min < 2) possessive_quantifier = FALSE; + } + + /* Non-possessive quantifier */ + + else *ketcode = OP_KETRMAX + repeat_type; + } + } + } + break; + + /* If previous was a character type match (\d or similar), abolish it and + create a suitable repeat item. The code is shared with single-character + repeats by setting op_type to add a suitable offset into repeat_type. + Note the the Unicode property types will be present only when + SUPPORT_UNICODE is defined, but we don't wrap the little bits of code + here because it just makes it horribly messy. 
*/ + + default: + if (op_previous >= OP_EODN) /* Not a character type - internal error */ + { + *errorcodeptr = ERR10; + return 0; + } + else + { + int prop_type, prop_value; + PCRE2_UCHAR *oldcode; + + if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; + + op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ + mclength = 0; /* Not a character */ + + if (op_previous == OP_PROP || op_previous == OP_NOTPROP) + { + prop_type = previous[1]; + prop_value = previous[2]; + } + else + { + /* Come here from just above with a character in mcbuffer/mclength. */ + OUTPUT_SINGLE_REPEAT: + prop_type = prop_value = -1; + } + + /* At this point, if prop_type == prop_value == -1 we either have a + character in mcbuffer when mclength is greater than zero, or we have + mclength zero, in which case there is a non-property character type in + op_previous. If prop_type/value are not negative, we have a property + character type in op_previous. */ + + oldcode = code; /* Save where we were */ + code = previous; /* Usually overwrite previous item */ + + /* If the maximum is zero then the minimum must also be zero; Perl allows + this case, so we do too - by simply omitting the item altogether. */ + + if (repeat_max == 0) goto END_REPEAT; + + /* Combine the op_type with the repeat_type */ + + repeat_type += op_type; + + /* A minimum of zero is handled either as the special case * or ?, or as + an UPTO, with the maximum given. */ + + if (repeat_min == 0) + { + if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type; + else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + + /* A repeat minimum of 1 is optimized into some special cases. If the + maximum is unlimited, we use OP_PLUS. Otherwise, the original item is + left in place and, if the maximum is greater than 1, we use OP_UPTO with + one less than the maximum. 
*/ + + else if (repeat_min == 1) + { + if (repeat_max == REPEAT_UNLIMITED) + *code++ = OP_PLUS + repeat_type; + else + { + code = oldcode; /* Leave previous item in place */ + if (repeat_max == 1) goto END_REPEAT; + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max - 1); + } + } + + /* The case {n,n} is just an EXACT, while the general case {n,m} is + handled as an EXACT followed by an UPTO or STAR or QUERY. */ + + else + { + *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ + PUT2INC(code, 0, repeat_min); + + /* Unless repeat_max equals repeat_min, fill in the data for EXACT, + and then generate the second opcode. For a repeated Unicode property + match, there are two extra values that define the required property, + and mclength is set zero to indicate this. */ + + if (repeat_max != repeat_min) + { + if (mclength > 0) + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + } + else + { + *code++ = op_previous; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + } + + /* Now set up the following opcode */ + + if (repeat_max == REPEAT_UNLIMITED) + *code++ = OP_STAR + repeat_type; + else + { + repeat_max -= repeat_min; + if (repeat_max == 1) + { + *code++ = OP_QUERY + repeat_type; + } + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + } + } + + /* Fill in the character or character type for the final opcode. */ + + if (mclength > 0) + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + } + else + { + *code++ = op_previous; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + } + } + break; + } /* End of switch on different op_previous values */ + + + /* If the character following a repeat is '+', possessive_quantifier is + TRUE. For some opcodes, there are special alternative opcodes for this + case. For anything else, we wrap the entire repeated item inside OP_ONCE + brackets. 
Logically, the '+' notation is just syntactic sugar, taken from + Sun's Java package, but the special opcodes can optimize it. + + Some (but not all) possessively repeated subpatterns have already been + completely handled in the code just above. For them, possessive_quantifier + is always FALSE at this stage. Note that the repeated item starts at + tempcode, not at previous, which might be the first part of a string whose + (former) last char we repeated. */ + + if (possessive_quantifier) + { + int len; + + /* Possessifying an EXACT quantifier has no effect, so we can ignore it. + However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, + {5,}, or {5,10}). We skip over an EXACT item; if the length of what + remains is greater than zero, there's a further opcode that can be + handled. If not, do nothing, leaving the EXACT alone. */ + + switch(*tempcode) + { + case OP_TYPEEXACT: + tempcode += PRIV(OP_lengths)[*tempcode] + + ((tempcode[1 + IMM2_SIZE] == OP_PROP + || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); + break; + + /* CHAR opcodes are used for exacts whose count is 1. */ + + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + tempcode += PRIV(OP_lengths)[*tempcode]; +#ifdef SUPPORT_UNICODE + if (utf && HAS_EXTRALEN(tempcode[-1])) + tempcode += GET_EXTRALEN(tempcode[-1]); +#endif + break; + + /* For the class opcodes, the repeat operator appears at the end; + adjust tempcode to point to it. */ + + case OP_CLASS: + case OP_NCLASS: + tempcode += 1 + 32/sizeof(PCRE2_UCHAR); + break; + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + tempcode += GET(tempcode, 1); + break; +#endif + } + + /* If tempcode is equal to code (which points to the end of the repeated + item), it means we have skipped an EXACT item but there is no following + QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. 
In + all other cases, tempcode will be pointing to the repeat opcode, and will + be less than code, so the value of len will be greater than 0. */ + + len = (int)(code - tempcode); + if (len > 0) + { + unsigned int repcode = *tempcode; + + /* There is a table for possessifying opcodes, all of which are less + than OP_CALLOUT. A zero entry means there is no possessified version. + */ + + if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) + *tempcode = opcode_possessify[repcode]; + + /* For opcode without a special possessified version, wrap the item in + ONCE brackets. */ + + else + { + (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; + tempcode[0] = OP_ONCE; + *code++ = OP_KET; + PUTINC(code, 0, len); + PUT(tempcode, 1, len); + } + } + } + + /* We set the "follows varying string" flag for subsequently encountered + reqcus if it isn't already set and we have just passed a varying length + item. */ + + END_REPEAT: + cb->req_varyopt |= reqvary; + break; + + + /* ===================================================================*/ + /* Handle a 32-bit data character with a value greater than META_END. */ + + case META_BIGVALUE: + pptr++; + goto NORMAL_CHAR; + + + /* ===============================================================*/ + /* Handle a back reference by number, which is the meta argument. The + pattern offsets for back references to group numbers less than 10 are held + in a special vector, to avoid using more than two parsed pattern elements + in 64-bit environments. We only need the offset to the first occurrence, + because if that doesn't fail, subsequent ones will also be OK. 
*/ + + case META_BACKREF: + if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; + else GETPLUSOFFSET(offset, pptr); + + if (meta_arg > cb->bracount) + { + cb->erroroffset = offset; + *errorcodeptr = ERR15; /* Non-existent subpattern */ + return 0; + } + + /* Come here from named backref handling when the reference is to a + single group (that is, not to a duplicated name). The back reference + data will have already been updated. We must disable firstcu if not + set, to cope with cases like (?=(\w+))\1: which would otherwise set ':' + later. */ + + HANDLE_SINGLE_REFERENCE: + if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; + PUT2INC(code, 0, meta_arg); + + /* Update the map of back references, and keep the highest one. We + could do this in parse_regex() for numerical back references, but not + for named back references, because we don't know the numbers to which + named back references refer. So we do it all in this function. */ + + cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; + if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; + break; + + + /* ===============================================================*/ + /* Handle recursion by inserting the number of the called group (which is + the meta argument) after OP_RECURSE. At the end of compiling the pattern is + scanned and these numbers are replaced by offsets within the pattern. It is + done like this to avoid problems with forward references and adjusting + offsets when groups are duplicated and moved (as discovered in previous + implementations). Note that a recursion does not have a set first + character. 
*/ + + case META_RECURSE: + GETPLUSOFFSET(offset, pptr); + if (meta_arg > cb->bracount) + { + cb->erroroffset = offset; + *errorcodeptr = ERR15; /* Non-existent subpattern */ + return 0; + } + HANDLE_NUMERICAL_RECURSION: + *code = OP_RECURSE; + PUT(code, 1, meta_arg); + code += 1 + LINK_SIZE; + groupsetfirstcu = FALSE; + cb->had_recurse = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + break; + + + /* ===============================================================*/ + /* Handle capturing parentheses; the number is the meta argument. */ + + case META_CAPTURE: + bravalue = OP_CBRA; + skipunits = IMM2_SIZE; + PUT2(code, 1+LINK_SIZE, meta_arg); + cb->lastcapture = meta_arg; + goto GROUP_PROCESS_NOTE_EMPTY; + + + /* ===============================================================*/ + /* Handle escape sequence items. For ones like \d, the ESC_values are + arranged to be the same as the corresponding OP_values in the default case + when PCRE2_UCP is not set (which is the only case in which they will appear + here). + + Note: \Q and \E are never seen here, as they were dealt with in + parse_pattern(). Neither are numerical back references or recursions, which + were turned into META_BACKREF or META_RECURSE items, respectively. \k and + \g, when followed by names, are turned into META_BACKREF_BYNAME or + META_RECURSE_BYNAME. */ + + case META_ESCAPE: + + /* We can test for escape sequences that consume a character because their + values lie between ESC_b and ESC_Z; this may have to change if any new ones + are ever created. For these sequences, we disable the setting of a first + character if it hasn't already been set. */ + + if (meta_arg > ESC_b && meta_arg < ESC_Z) + { + matched_char = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + } + + /* Set values to reset to if this is followed by a zero repeat. 
*/ + + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If Unicode is not supported, \P and \p are not allowed and are + faulted at parse time, so will never appear here. */ + +#ifdef SUPPORT_UNICODE + if (meta_arg == ESC_P || meta_arg == ESC_p) + { + uint32_t ptype = *(++pptr) >> 16; + uint32_t pdata = *pptr & 0xffff; + + /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit + from the auto-anchoring code. */ + + if (meta_arg == ESC_p && ptype == PT_ANY) + { + *code++ = OP_ALLANY; + } + else + { + *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + } + break; /* End META_ESCAPE */ + } +#endif + + /* \K is forbidden in lookarounds since 10.38 because that's what Perl has + done. However, there's an option, in case anyone was relying on it. */ + + if (cb->assert_depth > 0 && meta_arg == ESC_K && + (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0) + { + *errorcodeptr = ERR99; + return 0; + } + + /* For the rest (including \X when Unicode is supported - if not it's + faulted at parse time), the OP value is the escape value when PCRE2_UCP is + not set; if it is set, these escapes do not show up here because they are + converted into Unicode property tests in parse_regex(). Note that \b and \B + do a one-character lookbehind, and \A also behaves as if it does. */ + + if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ + if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) && + cb->max_lookbehind == 0) + cb->max_lookbehind = 1; + + /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY + instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 + *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg; +#else + *code++ = (!utf && meta_arg == ESC_C)? 
OP_ALLANY : meta_arg; +#endif + break; /* End META_ESCAPE */ + + + /* ===================================================================*/ + /* Handle an unrecognized meta value. A parsed pattern value less than + META_END is a literal. Otherwise we have a problem. */ + + default: + if (meta >= META_END) + { +#ifdef DEBUG_SHOW_PARSED + fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr); +#endif + *errorcodeptr = ERR89; /* Internal error - unrecognized. */ + return 0; + } + + /* Handle a literal character. We come here by goto in the case of a + 32-bit, non-UTF character whose value is greater than META_END. */ + + NORMAL_CHAR: + meta = *pptr; /* Get the full 32 bits */ + NORMAL_CHAR_SET: /* Character is already in meta */ + matched_char = TRUE; + + /* For caseless UTF or UCP mode, check whether this character has more than + one other case. If so, generate a special OP_PROP item instead of OP_CHARI. + When casing restrictions apply, ignore caseless sets that start with an + ASCII character. */ + +#ifdef SUPPORT_UNICODE + if ((utf||ucp) && (options & PCRE2_CASELESS) != 0) + { + uint32_t caseset = UCD_CASESET(meta); + if (caseset != 0 && + ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 || + PRIV(ucd_caseless_sets)[caseset] > 127)) + { + *code++ = OP_PROP; + *code++ = PT_CLIST; + *code++ = caseset; + if (firstcuflags == REQ_UNSET) + firstcuflags = zerofirstcuflags = REQ_NONE; + break; /* End handling this meta item */ + } + } +#endif + + /* Caseful matches, or caseless and not one of the multicase characters. We + come here by goto in the case of a positive class that contains only + case-partners of a character with just two cases; matched_char has already + been set TRUE and options fudged if necessary. */ + + CLASS_CASELESS_CHAR: + + /* Get the character's code units into mcbuffer, with the length in + mclength. When not in UTF mode, the length is always 1. 
*/ + +#ifdef SUPPORT_UNICODE + if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else +#endif + { + mclength = 1; + mcbuffer[0] = meta; + } + + /* Generate the appropriate code */ + + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + + /* Remember if \r or \n were seen */ + + if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) + cb->external_flags |= PCRE2_HASCRORLF; + + /* Set the first and required code units appropriately. If no previous + first code unit, set it from this character, but revert to none on a zero + repeat. Otherwise, leave the firstcu value alone, and don't change it on + a zero repeat. */ + + if (firstcuflags == REQ_UNSET) + { + zerofirstcuflags = REQ_NONE; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If the character is more than one code unit long, we can set a single + firstcu only if it is not to be matched caselessly. Multiple possible + starting code units may be picked up later in the studying code. */ + + if (mclength == 1 || req_caseopt == 0) + { + firstcu = mcbuffer[0]; + firstcuflags = req_caseopt; + if (mclength != 1) + { + reqcu = code[-1]; + reqcuflags = cb->req_varyopt; + } + } + else firstcuflags = reqcuflags = REQ_NONE; + } + + /* firstcu was previously set; we can set reqcu only if the length is + 1 or the matching is caseful. */ + + else + { + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + if (mclength == 1 || req_caseopt == 0) + { + reqcu = code[-1]; + reqcuflags = req_caseopt | cb->req_varyopt; + } + } + + /* If caselessness was temporarily instated, reset it. */ + + if (reset_caseful) + { + options &= ~PCRE2_CASELESS; + req_caseopt = 0; + reset_caseful = FALSE; + } + + break; /* End literal character handling */ + } /* End of big switch */ + } /* End of big loop */ + +/* Control never reaches here. 
*/
+}
+
+
+
+/*************************************************
+*    Compile regex: a sequence of alternatives   *
+*************************************************/
+
+/* On entry, pptr is pointing past the bracket meta, but on return it points to
+the closing bracket or META_END. The code variable is pointing at the code unit
+into which the BRA operator has been stored. This function is used during the
+pre-compile phase when we are trying to find out the amount of memory needed,
+as well as during the real compile phase. The value of lengthptr distinguishes
+the two phases.
+
+While the group is open, the link field of its opening item is kept at zero
+and each OP_ALT item holds the offset back to the previous branch. When the
+group closes during the real compile phase, that chain is walked backwards and
+rewritten so that each link holds the offset forward to the next item.
+
+Errors raised by this function itself are ERR33 (the stack guard callback
+returned non-zero) and ERR20 (adding this group's length to *lengthptr would
+exceed OFLOW_MAX). A zero return from compile_branch() is passed straight back
+without touching *errorcodeptr here. The output pointers (codeptr, pptrptr and
+the four firstcu/reqcu places) are written only on a successful return.
+
+Arguments:
+  options           option bits, including any changes for this subpattern
+  xoptions          extra option bits, ditto
+  codeptr           -> the address of the current code pointer
+  pptrptr           -> the address of the current parsed pattern pointer
+  errorcodeptr      -> pointer to error code variable
+  skipunits         skip this many code units at start (for brackets and OP_COND)
+  firstcuptr        place to put the first required code unit
+  firstcuflagsptr   place to put the first code unit flags
+  reqcuptr          place to put the last required code unit
+  reqcuflagsptr     place to put the last required code unit flags
+  bcptr             pointer to the chain of currently open branches
+  cb                points to the data block with tables pointers etc.
+  lengthptr         NULL during the real compile phase
+                    points to length accumulator during pre-compile phase
+
+Returns:            0 There has been an error
+                   +1 Success, this group must match at least one character
+                   -1 Success, this group may match an empty string
+*/
+
+static int
+compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
+  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
+  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
+  uint32_t *reqcuflagsptr, branch_chain *bcptr, compile_block *cb,
+  PCRE2_SIZE *lengthptr)
+{
+PCRE2_UCHAR *code = *codeptr;
+PCRE2_UCHAR *last_branch = code;
+PCRE2_UCHAR *start_bracket = code;
+BOOL lookbehind;
+open_capitem capitem;
+int capnumber = 0;
+int okreturn = 1;
+uint32_t *pptr = *pptrptr;
+uint32_t firstcu, reqcu;
+uint32_t lookbehindlength;
+uint32_t firstcuflags, reqcuflags;
+uint32_t branchfirstcu, branchreqcu;
+uint32_t branchfirstcuflags, branchreqcuflags;
+PCRE2_SIZE length;
+branch_chain bc;
+
+/* If set, call the external function that checks for stack availability. A
+non-zero result from it is reported as ERR33. */
+
+if (cb->cx->stack_guard != NULL &&
+    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
+  {
+  *errorcodeptr= ERR33;
+  return 0;
+  }
+
+/* Miscellaneous initialization */
+
+bc.outer = bcptr;
+bc.current_branch = code;
+
+firstcu = reqcu = 0;
+firstcuflags = reqcuflags = REQ_UNSET;
+
+/* Accumulate the length for use in the pre-compile phase. Start with the
+length of the BRA and KET and any extra code units that are required at the
+beginning. We accumulate in a local variable to save frequent testing of
+lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
+start and end of each alternative, because compiled items are discarded during
+the pre-compile phase so that the workspace is not exceeded. */
+
+length = 2 + 2*LINK_SIZE + skipunits;
+
+/* Remember if this is a lookbehind assertion, and if it is, save its length
+and skip over the pattern offset. */
+
+lookbehind = *code == OP_ASSERTBACK ||
+             *code == OP_ASSERTBACK_NOT ||
+             *code == OP_ASSERTBACK_NA;
+
+if (lookbehind)
+  {
+  lookbehindlength = META_DATA(pptr[-1]);
+  pptr += SIZEOFFSET;
+  }
+else lookbehindlength = 0;
+
+/* If this is a capturing subpattern, add to the chain of open capturing items
+so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
+need be tested here; changing this opcode to one of its variants, e.g.
+OP_SCBRAPOS, happens later, after the group has been compiled. The chain item
+is a local variable of this call, so it is unlinked again before the successful
+return below. */
+
+if (*code == OP_CBRA)
+  {
+  capnumber = GET2(code, 1 + LINK_SIZE);
+  capitem.number = capnumber;
+  capitem.next = cb->open_caps;
+  capitem.assert_depth = cb->assert_depth;
+  cb->open_caps = &capitem;
+  }
+
+/* Offset is set zero to mark that this bracket is still open */
+
+PUT(code, 1, 0);
+code += 1 + LINK_SIZE + skipunits;
+
+/* Loop for each alternative branch */
+
+for (;;)
+  {
+  int branch_return;
+
+  /* Insert OP_REVERSE if this is a lookbehind assertion. Nothing is inserted
+  when the recorded lookbehind length is zero. */
+
+  if (lookbehind && lookbehindlength > 0)
+    {
+    *code++ = OP_REVERSE;
+    PUTINC(code, 0, lookbehindlength);
+    length += 1 + LINK_SIZE;
+    }
+
+  /* Now compile the branch; in the pre-compile phase its length gets added
+  into the length. */
+
+  if ((branch_return =
+        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
+          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
+          &bc, cb, (lengthptr == NULL)? NULL : &length)) == 0)
+    return 0;
+
+  /* If a branch can match an empty string, so can the whole group. */
+
+  if (branch_return < 0) okreturn = -1;
+
+  /* In the real compile phase, there is some post-processing to be done. */
+
+  if (lengthptr == NULL)
+    {
+    /* If this is the first branch, the firstcu and reqcu values for the
+    branch become the values for the regex. */
+
+    if (*last_branch != OP_ALT)
+      {
+      firstcu = branchfirstcu;
+      firstcuflags = branchfirstcuflags;
+      reqcu = branchreqcu;
+      reqcuflags = branchreqcuflags;
+      }
+
+    /* If this is not the first branch, the first char and reqcu have to
+    match the values from all the previous branches, except that if the
+    previous value for reqcu didn't have REQ_VARY set, it can still match,
+    and we set REQ_VARY for the group from this branch's value. */
+
+    else
+      {
+      /* If we previously had a firstcu, but it doesn't match the new branch,
+      we have to abandon the firstcu for the regex, but if there was
+      previously no reqcu, it takes on the value of the old firstcu. */
+
+      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
+        {
+        if (firstcuflags < REQ_NONE)
+          {
+          if (reqcuflags >= REQ_NONE)
+            {
+            reqcu = firstcu;
+            reqcuflags = firstcuflags;
+            }
+          }
+        firstcuflags = REQ_NONE;
+        }
+
+      /* If we (now or from before) have no firstcu, a firstcu from the
+      branch becomes a reqcu if there isn't a branch reqcu. */
+
+      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
+          branchreqcuflags >= REQ_NONE)
+        {
+        branchreqcu = branchfirstcu;
+        branchreqcuflags = branchfirstcuflags;
+        }
+
+      /* Now ensure that the reqcus match */
+
+      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
+          reqcu != branchreqcu)
+        reqcuflags = REQ_NONE;
+      else
+        {
+        reqcu = branchreqcu;
+        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
+        }
+      }
+    }
+
+  /* Handle reaching the end of the expression, either ')' or end of pattern.
+  In the real compile phase, go back through the alternative branches and
+  reverse the chain of offsets, with the field in the BRA item now becoming an
+  offset to the first alternative. If there are no alternatives, it points to
+  the end of the group. The length in the terminating ket is always the length
+  of the whole bracketed item. Return leaving the pointer at the terminating
+  char. */
+
+  if (META_CODE(*pptr) != META_ALT)
+    {
+    if (lengthptr == NULL)
+      {
+      PCRE2_SIZE branch_length = code - last_branch;
+      do
+        {
+        PCRE2_SIZE prev_length = GET(last_branch, 1);
+        PUT(last_branch, 1, branch_length);
+        branch_length = prev_length;
+        last_branch -= branch_length;
+        }
+      while (branch_length > 0);  /* The zero link was the opening item's */
+      }
+
+    /* Fill in the ket */
+
+    *code = OP_KET;
+    PUT(code, 1, (int)(code - start_bracket));
+    code += 1 + LINK_SIZE;
+
+    /* If it was a capturing subpattern, remove the block from the chain. */
+
+    if (capnumber > 0) cb->open_caps = cb->open_caps->next;
+
+    /* Set values to pass back. In the pre-compile phase the accumulated length
+    is also added to the caller's total, with ERR20 given if the sum would
+    exceed OFLOW_MAX. */
+
+    *codeptr = code;
+    *pptrptr = pptr;
+    *firstcuptr = firstcu;
+    *firstcuflagsptr = firstcuflags;
+    *reqcuptr = reqcu;
+    *reqcuflagsptr = reqcuflags;
+    if (lengthptr != NULL)
+      {
+      if (OFLOW_MAX - *lengthptr < length)
+        {
+        *errorcodeptr = ERR20;
+        return 0;
+        }
+      *lengthptr += length;
+      }
+    return okreturn;
+    }
+
+  /* Another branch follows. In the pre-compile phase, we can move the code
+  pointer back to where it was for the start of the first branch. (That is,
+  pretend that each branch is the only one.)
+
+  In the real compile phase, insert an ALT node. Its length field points back
+  to the previous branch while the bracket remains open. At the end the chain
+  is reversed. It's done like this so that the start of the bracket has a
+  zero offset until it is closed, making it possible to detect recursion. */
+
+  if (lengthptr != NULL)
+    {
+    code = *codeptr + 1 + LINK_SIZE + skipunits;  /* As set before the loop */
+    length += 1 + LINK_SIZE;
+    }
+  else
+    {
+    *code = OP_ALT;
+    PUT(code, 1, (int)(code - last_branch));
+    bc.current_branch = last_branch = code;
+    code += 1 + LINK_SIZE;
+    }
+
+  /* Set the lookbehind length (if not in a lookbehind the value will be zero)
+  and then advance past the vertical bar. */
+
+  lookbehindlength = META_DATA(*pptr);
+  pptr++;
+  }
+/* Control never reaches here */
+}
+
+
+
+/*************************************************
+*          Check for anchored pattern            *
+*************************************************/
+
+/* Try to find out if this is an anchored regular expression. Consider each
+alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
+all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
+it's anchored. However, if this is a multiline pattern, then only OP_SOD will
+be found, because ^ generates OP_CIRCM in that mode.
+
+We can also consider a regex to be anchored if OP_SOM starts all its branches.
+This is the code for \G, which means "match at start of match position, taking
+into account the match offset".
+
+A branch is also implicitly anchored if it starts with .* and DOTALL is set,
+because that will try the rest of the pattern at all possible matching points,
+so there is no point trying again.... er ....
+
+.... except when the .* appears inside capturing parentheses, and there is a
+subsequent back reference to those parentheses. We haven't enough information
+to catch that case precisely.
+
+At first, the best we could do was to detect when .* was in capturing brackets
+and the highest back reference was greater than or equal to that level.
+However, by keeping a bitmap of the first 31 back references, we can catch some
+of the more common cases more precisely.
+
+... A second exception is when the .* appears inside an atomic group, because
+this prevents the number of characters it matches from being adjusted.
+
+Arguments:
+  code           points to start of the compiled pattern
+  bracket_map    a bitmap of which brackets we are inside while testing; this
+                   handles up to substring 31; after that we just have to take
+                   the less precise approach
+  cb             points to the compile data block
+  atomcount      atomic group level
+  inassert       TRUE if in an assertion
+
+Returns:     TRUE or FALSE
+*/
+
+static BOOL
+is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
+  int atomcount, BOOL inassert)
+{
+do {
+   PCRE2_SPTR scode = first_significant_code(
+     code + PRIV(OP_lengths)[*code], FALSE);
+   int op = *scode;
+
+   /* Non-capturing brackets. The group is checked recursively with the same
+   map, atomic level and assertion state. */
+
+   if (op == OP_BRA  || op == OP_BRAPOS ||
+       op == OP_SBRA || op == OP_SBRAPOS)
+     {
+     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+       return FALSE;
+     }
+
+   /* Capturing brackets. The group's number is added to the map passed to the
+   recursive call: groups numbered below 32 get their own bit, and every
+   higher-numbered group shares bit 0. */
+
+   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
+            op == OP_SCBRA || op == OP_SCBRAPOS)
+     {
+     int n = GET2(scode, 1+LINK_SIZE);
+     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
+     if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
+     }
+
+   /* Positive forward assertion. Its contents are checked with inassert forced
+   to TRUE. */
+
+   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
+     {
+     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
+     }
+
+   /* Condition. If there is no second branch, it can't be anchored. */
+
+   else if (op == OP_COND || op == OP_SCOND)
+     {
+     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
+     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+       return FALSE;
+     }
+
+   /* Atomic groups. The contents are checked with atomcount incremented. */
+
+   else if (op == OP_ONCE)
+     {
+     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
+       return FALSE;
+     }
+
+   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
+   it isn't in brackets that are or may be referenced or inside an atomic
+   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
+   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
+   with the subject "aab", which matches "b", i.e. not at the start of a line.
+   There is also an option that disables auto-anchoring. */
+
+   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
+             op == OP_TYPEPOSSTAR))
+     {
+     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
+         atomcount > 0 || cb->had_pruneorskip || inassert ||
+         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
+       return FALSE;
+     }
+
+   /* Check for explicit anchoring. Any opcode other than OP_SOD, OP_SOM or
+   OP_CIRC at this point means the branch is not anchored. */
+
+   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
+
+   code += GET(code, 1);
+   }
+while (*code == OP_ALT);   /* Loop for each alternative */
+return TRUE;               /* No branch failed the test */
+}
+
+
+
+/*************************************************
+*         Check for starting with ^ or .*        *
+*************************************************/
+
+/* This is called to find out if every branch starts with ^ or .* so that
+"first char" processing can be done to speed things up in multiline
+matching and for non-DOTALL patterns that start with .* (which must start at
+the beginning or after \n). As in the case of is_anchored() (see above), we
+have to take account of back references to capturing brackets that contain .*
+because in that case we can't make the assumption. Also, the appearance of .*
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
+
+Arguments:
+  code           points to start of the compiled pattern or a group
+  bracket_map    a bitmap of which brackets we are inside while testing; this
+                   handles up to substring 31; after that we just have to take
+                   the less precise approach
+  cb             points to the compile data
+  atomcount      atomic group level
+  inassert       TRUE if in an assertion
+
+Returns:         TRUE or FALSE
+*/
+
+static BOOL
+is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
+  int atomcount, BOOL inassert)
+{
+do {
+   PCRE2_SPTR scode = first_significant_code(
+     code + PRIV(OP_lengths)[*code], FALSE);
+   int op = *scode;
+
+   /* If we are at the start of a conditional assertion group, *both* the
+   conditional assertion *and* what follows the condition must satisfy the test
+   for start of line. Other kinds of condition fail. Note that there may be an
+   auto-callout at the start of a condition. After the assertion has been
+   checked, scode and op are re-read from the first significant item that
+   follows it, so the tests below apply to what follows the condition. */
+
+   if (op == OP_COND)
+     {
+     scode += 1 + LINK_SIZE;
+
+     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
+       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
+
+     switch (*scode)
+       {
+       case OP_CREF:
+       case OP_DNCREF:
+       case OP_RREF:
+       case OP_DNRREF:
+       case OP_FAIL:
+       case OP_FALSE:
+       case OP_TRUE:
+       return FALSE;
+
+       default:     /* Assertion */
+       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
+       do scode += GET(scode, 1); while (*scode == OP_ALT);  /* Over all branches */
+       scode += 1 + LINK_SIZE;  /* Past the item that follows the last branch */
+       break;
+       }
+     scode = first_significant_code(scode, FALSE);
+     op = *scode;
+     }
+
+   /* Non-capturing brackets */
+
+   if (op == OP_BRA  || op == OP_BRAPOS ||
+       op == OP_SBRA || op == OP_SBRAPOS)
+     {
+     if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
+       return FALSE;
+     }
+
+   /* Capturing brackets. The group's number is added to the map passed to the
+   recursive call: groups numbered below 32 get their own bit, and every
+   higher-numbered group shares bit 0. */
+
+   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
+            op == OP_SCBRA || op == OP_SCBRAPOS)
+     {
+     int n = GET2(scode, 1+LINK_SIZE);
+     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
+     if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
+     }
+
+   /* Positive forward assertions. The contents are checked with inassert
+   forced to TRUE. */
+
+   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
+     {
+     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
+       return FALSE;
+     }
+
+   /* Atomic brackets. The contents are checked with atomcount incremented. */
+
+   else if (op == OP_ONCE)
+     {
+     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
+       return FALSE;
+     }
+
+   /* .* means "start at start or after \n" if it isn't in atomic brackets or
+   brackets that may be referenced or an assertion, and as long as the pattern
+   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
+   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
+   i.e. not at the start of a line. There is also an option that disables this
+   optimization. */
+
+   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
+     {
+     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
+         atomcount > 0 || cb->had_pruneorskip || inassert ||
+         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
+       return FALSE;
+     }
+
+   /* Check for explicit circumflex; anything else gives a FALSE result. Note
+   in particular that this includes atomic brackets OP_ONCE because the number
+   of characters matched by .* cannot be adjusted inside them. */
+
+   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
+
+   /* Move on to the next alternative */
+
+   code += GET(code, 1);
+   }
+while (*code == OP_ALT);  /* Loop for each alternative */
+return TRUE;              /* No branch failed the test */
+}
+
+
+
+/*************************************************
+*   Scan compiled regex for recursion reference  *
+*************************************************/
+
+/* This function scans through a compiled pattern until it finds an instance of
+OP_RECURSE.
+
+Arguments:
+  code        points to start of expression
+  utf         TRUE in UTF mode
+
+Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
+*/
+
+static PCRE2_SPTR
+find_recurse(PCRE2_SPTR code, BOOL utf)
+{
+for (;;)
+  {
+  PCRE2_UCHAR c = *code;
+  if (c == OP_END) return NULL;      /* End reached without finding one */
+  if (c == OP_RECURSE) return code;
+
+  /* XCLASS is used for classes that cannot be represented just by a bit map.
+  This includes negated single high-valued characters. CALLOUT_STR is used for
+  callouts with string arguments. In both cases the length in the table is
+  zero; the actual length is stored in the compiled code. */
+
+  if (c == OP_XCLASS) code += GET(code, 1);
+    else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
+
+  /* Otherwise, we can get the item's length from the table, except that for
+  repeated character types, we have to test for \p and \P, which have an extra
+  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
+  we must add in its length. */
+
+  else
+    {
+    switch(c)
+      {
+      case OP_TYPESTAR:
+      case OP_TYPEMINSTAR:
+      case OP_TYPEPLUS:
+      case OP_TYPEMINPLUS:
+      case OP_TYPEQUERY:
+      case OP_TYPEMINQUERY:
+      case OP_TYPEPOSSTAR:
+      case OP_TYPEPOSPLUS:
+      case OP_TYPEPOSQUERY:
+      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
+      break;
+
+      case OP_TYPEPOSUPTO:
+      case OP_TYPEUPTO:
+      case OP_TYPEMINUPTO:
+      case OP_TYPEEXACT:
+      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
+        code += 2;
+      break;
+
+      case OP_MARK:
+      case OP_COMMIT_ARG:
+      case OP_PRUNE_ARG:
+      case OP_SKIP_ARG:
+      case OP_THEN_ARG:
+      code += code[1];   /* The unit after the opcode is the argument length */
+      break;
+      }
+
+    /* Add in the fixed length from the table. This is in addition to any
+    variable amount already added by the switch above. */
+
+    code += PRIV(OP_lengths)[c];
+
+    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
+    be followed by a multi-unit character. The length in the table is a
+    minimum, so we have to arrange to skip the extra units. The test is made on
+    code[-1], the last code unit counted by the table length. */
+
+#ifdef MAYBE_UTF_MULTI
+    if (utf) switch(c)
+      {
+      case OP_CHAR:
+      case OP_CHARI:
+      case OP_NOT:
+      case OP_NOTI:
+      case OP_EXACT:
+      case OP_EXACTI:
+      case OP_NOTEXACT:
+      case OP_NOTEXACTI:
+      case OP_UPTO:
+      case OP_UPTOI:
+      case OP_NOTUPTO:
+      case OP_NOTUPTOI:
+      case OP_MINUPTO:
+      case OP_MINUPTOI:
+      case OP_NOTMINUPTO:
+      case OP_NOTMINUPTOI:
+      case OP_POSUPTO:
+      case OP_POSUPTOI:
+      case OP_NOTPOSUPTO:
+      case OP_NOTPOSUPTOI:
+      case OP_STAR:
+      case OP_STARI:
+      case OP_NOTSTAR:
+      case OP_NOTSTARI:
+      case OP_MINSTAR:
+      case OP_MINSTARI:
+      case OP_NOTMINSTAR:
+      case OP_NOTMINSTARI:
+      case OP_POSSTAR:
+      case OP_POSSTARI:
+      case OP_NOTPOSSTAR:
+      case OP_NOTPOSSTARI:
+      case OP_PLUS:
+      case OP_PLUSI:
+      case OP_NOTPLUS:
+      case OP_NOTPLUSI:
+      case OP_MINPLUS:
+      case OP_MINPLUSI:
+      case OP_NOTMINPLUS:
+      case OP_NOTMINPLUSI:
+      case OP_POSPLUS:
+      case OP_POSPLUSI:
+      case OP_NOTPOSPLUS:
+      case OP_NOTPOSPLUSI:
+      case OP_QUERY:
+      case OP_QUERYI:
+      case OP_NOTQUERY:
+      case OP_NOTQUERYI:
+      case OP_MINQUERY:
+      case OP_MINQUERYI:
+      case OP_NOTMINQUERY:
+      case OP_NOTMINQUERYI:
+      case OP_POSQUERY:
+      case OP_POSQUERYI:
+      case OP_NOTPOSQUERY:
+      case OP_NOTPOSQUERYI:
+      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
+      break;
+      }
+#else
+    (void)(utf);  /* Keep compiler happy by referencing function argument */
+#endif  /* MAYBE_UTF_MULTI */
+    }
+  }
+}
+
+
+
+/*************************************************
+*    Check for asserted fixed first code unit    *
+*************************************************/
+
+/* During compilation, the "first code unit" settings from forward assertions
+are discarded, because they can cause conflicts with actual literals that
+follow. However, if we end up without a first code unit setting for an
+unanchored pattern, it is worth scanning the regex to see if there is an
+initial asserted first code unit.
If all branches start with the same asserted
+code unit, or with a non-conditional bracket all of whose alternatives start
+with the same asserted code unit (recurse ad lib), then we return that code
+unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
+REQ_NONE in the flags.
+
+Arguments:
+  code       points to start of compiled pattern
+  flags      points to the first code unit flags
+  inassert   non-zero if in an assertion
+
+Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
+*/
+
+static uint32_t
+find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
+{
+uint32_t c = 0;
+uint32_t cflags = REQ_NONE;
+
+*flags = REQ_NONE;
+do {
+   uint32_t d;
+   uint32_t dflags;
+   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
+             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
+   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
+   PCRE2_UCHAR op = *scode;
+
+   switch(op)
+     {
+     default:
+     return 0;   /* *flags still holds the REQ_NONE stored on entry */
+
+     case OP_BRA:
+     case OP_BRAPOS:
+     case OP_CBRA:
+     case OP_SCBRA:
+     case OP_CBRAPOS:
+     case OP_SCBRAPOS:
+     case OP_ASSERT:
+     case OP_ASSERT_NA:
+     case OP_ONCE:
+     case OP_SCRIPT_RUN:
+     d = find_firstassertedcu(scode, &dflags, inassert +
+       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
+     if (dflags >= REQ_NONE) return 0;
+     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
+       else if (c != d || cflags != dflags) return 0;
+     break;
+
+     case OP_EXACT:
+     scode += IMM2_SIZE;
+     /* Fall through */
+
+     case OP_CHAR:
+     case OP_PLUS:
+     case OP_MINPLUS:
+     case OP_POSPLUS:
+     if (inassert == 0) return 0;
+     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
+       else if (c != scode[1]) return 0;
+     break;
+
+     case OP_EXACTI:
+     scode += IMM2_SIZE;
+     /* Fall through */
+
+     case OP_CHARI:
+     case OP_PLUSI:
+     case OP_MINPLUSI:
+     case OP_POSPLUSI:
+     if (inassert == 0) return 0;
+
+     /* If the character is more than one code unit long, we cannot set its
+     first code unit when matching caselessly. Later scanning may pick up
+     multiple code units. */
+
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+     if (scode[1] >= 0x80) return 0;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
+#endif
+#endif
+
+     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
+       else if (c != scode[1]) return 0;
+     break;
+     }
+
+   code += GET(code, 1);
+   }
+while (*code == OP_ALT);
+
+*flags = cflags;   /* Only a scan that completes every branch sets the flags */
+return c;
+}
+
+
+
+/*************************************************
+*     Add an entry to the name/number table      *
+*************************************************/
+
+/* This function is called between compiling passes to add an entry to the
+name/number table, maintaining alphabetical order. Checking for permitted
+and forbidden duplicates has already been done.
+
+Each entry occupies cb->name_entry_size code units: the group number is stored
+at the start of the slot, the name follows at offset IMM2_SIZE, and the
+remainder of the slot is zero filled. When the new name sorts before an
+existing entry, that entry and all those after it are moved up by one slot.
+This function does no bounds checking of its own, so the table must already
+have room for one more entry and the name must fit in a slot - presumably both
+are ensured when the table is sized; confirm against the caller.
+
+Arguments:
+  cb           the compile data block
+  name         the name to add
+  length       the length of the name
+  groupno      the group number
+  tablecount   the count of names in the table so far
+
+Returns:       nothing
+*/
+
+static void
+add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
+  unsigned int groupno, uint32_t tablecount)
+{
+uint32_t i;
+PCRE2_UCHAR *slot = cb->name_table;
+
+for (i = 0; i < tablecount; i++)
+  {
+  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
+  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
+    crc = -1; /* Current name is a substring */
+
+  /* Make space in the table and break the loop for an earlier name. For a
+  duplicate or later name, carry on. We do this for duplicates so that in the
+  simple case (when ?(| is not used) they are in order of their numbers. In all
+  cases they are in the order in which they appear in the pattern. */
+
+  if (crc < 0)
+    {
+    (void)memmove(slot + cb->name_entry_size, slot,
+      CU2BYTES((tablecount - i) * cb->name_entry_size));
+    break;
+    }
+
+  /* Continue the loop for a later or duplicate name */
+
+  slot += cb->name_entry_size;
+  }
+
+PUT2(slot, 0, groupno);
+memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
+
+/* Add a terminating zero and fill the rest of the slot with zeroes so that
+the memory is all initialized. Otherwise valgrind moans about uninitialized
+memory when saving serialized compiled patterns. */
+
+memset(slot + IMM2_SIZE + length, 0,
+  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
+}
+
+
+
+/*************************************************
+*             Skip in parsed pattern             *
+*************************************************/
+
+/* This function is called to skip parts of the parsed pattern when finding the
+length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
+the end of the branch, it is called to skip over an internal lookaround or
+(DEFINE) group, and it is also called to skip to the end of a class, during
+which it will never encounter nested groups (but there's no need to have
+special code for that).
+
+When called to find the end of a branch or group, pptr must point to the first
+meta code inside the branch, not the branch-starting code. In other cases it
+can point to the item that causes the function to be called.
+
+Arguments:
+  pptr       current pointer to skip from
+  skiptype   PSKIP_CLASS when skipping to end of class
+             PSKIP_ALT when META_ALT ends the skip
+             PSKIP_KET when only META_KET ends the skip
+
+Returns:     new value of pptr
+             NULL if META_END is reached - should never occur
+               or for an unknown meta value - likewise
+*/
+
+static uint32_t *
+parsed_skip(uint32_t *pptr, uint32_t skiptype)
+{
+uint32_t nestlevel = 0;
+
+for (;; pptr++)
+  {
+  uint32_t meta = META_CODE(*pptr);
+
+  switch(meta)
+    {
+    default:  /* Just skip over most items */
+    if (meta < META_END) continue;  /* Literal */
+    break;
+
+    /* This should never occur. */
+
+    case META_END:
+    return NULL;
+
+    /* The data for these items is variable in length. The amount added here
+    is in addition to the fixed amount taken from the table after the switch. */
+
+    case META_BACKREF:  /* Offset is present only if group >= 10 */
+    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
+    break;
+
+    case META_ESCAPE:   /* A few escapes are followed by data items. */
+    switch (META_DATA(*pptr))
+      {
+      case ESC_P:
+      case ESC_p:
+      pptr += 1;
+      break;
+
+      case ESC_g:
+      case ESC_k:
+      pptr += 1 + SIZEOFFSET;
+      break;
+      }
+    break;
+
+    case META_MARK:     /* Add the length of the name. */
+    case META_COMMIT_ARG:
+    case META_PRUNE_ARG:
+    case META_SKIP_ARG:
+    case META_THEN_ARG:
+    pptr += pptr[1];
+    break;
+
+    /* These are the "active" items in this loop. Group-opening items raise the
+    nesting level, and META_ALT and META_KET can end the skip only when the
+    nesting level is zero. */
+
+    case META_CLASS_END:
+    if (skiptype == PSKIP_CLASS) return pptr;
+    break;
+
+    case META_ATOMIC:
+    case META_CAPTURE:
+    case META_COND_ASSERT:
+    case META_COND_DEFINE:
+    case META_COND_NAME:
+    case META_COND_NUMBER:
+    case META_COND_RNAME:
+    case META_COND_RNUMBER:
+    case META_COND_VERSION:
+    case META_LOOKAHEAD:
+    case META_LOOKAHEADNOT:
+    case META_LOOKAHEAD_NA:
+    case META_LOOKBEHIND:
+    case META_LOOKBEHINDNOT:
+    case META_LOOKBEHIND_NA:
+    case META_NOCAPTURE:
+    case META_SCRIPT_RUN:
+    nestlevel++;
+    break;
+
+    case META_ALT:
+    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
+    break;
+
+    case META_KET:
+    if (nestlevel == 0) return pptr;
+    nestlevel--;
+    break;
+    }
+
+  /* The extra data item length for each meta is in a table. An index that is
+  outside the table is treated as an unknown meta value and NULL is returned.
+  NOTE(review): the bound is sizeof(meta_extra_lengths), which equals the
+  number of entries only if each entry is one byte wide - confirm against the
+  table's declaration. */
+
+  meta = (meta >> 16) & 0x7fff;
+  if (meta >= sizeof(meta_extra_lengths)) return NULL;
+  pptr += meta_extra_lengths[meta];
+  }
+/* Control never reaches here */
+return pptr;
+}
+
+
+
+/*************************************************
+*       Find length of a parsed group            *
+*************************************************/
+
+/* This is called for nested groups within a branch of a lookbehind whose
+length is being computed. If all the branches in the nested group have the same
+length, that is OK. On entry, the pointer must be at the first element after
+the group initializing code. On exit it points to OP_KET. Caching is used to
+improve processing speed when the same capturing group occurs many times.
+ +Arguments: + pptrptr pointer to pointer in the parsed pattern + isinline FALSE if a reference or recursion; TRUE for inline group + errcodeptr pointer to the errorcode + lcptr pointer to the loop counter + group number of captured group or -1 for a non-capturing group + recurses chain of recurse_check to catch mutual recursion + cb pointer to the compile data + +Returns: the group length or a negative number +*/ + +static int +get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr, + int group, parsed_recurse_check *recurses, compile_block *cb) +{ +int branchlength; +int grouplength = -1; + +/* The cache can be used only if there is no possibility of there being two +groups with the same number. We do not need to set the end pointer for a group +that is being processed as a back reference or recursion, but we must do so for +an inline group. */ + +if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0) + { + uint32_t groupinfo = cb->groupinfo[group]; + if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1; + if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) + { + if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET); + return groupinfo & GI_FIXED_LENGTH_MASK; + } + } + +/* Scan the group. In this case we find the end pointer of necessity. 
*/ + +for(;;) + { + branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); + if (branchlength < 0) goto ISNOTFIXED; + if (grouplength == -1) grouplength = branchlength; + else if (grouplength != branchlength) goto ISNOTFIXED; + if (**pptrptr == META_KET) break; + *pptrptr += 1; /* Skip META_ALT */ + } + +if (group > 0) + cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); +return grouplength; + +ISNOTFIXED: +if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH; +return -1; +} + + + +/************************************************* +* Find length of a parsed branch * +*************************************************/ + +/* Return a fixed length for a branch in a lookbehind, giving an error if the +length is not fixed. On entry, *pptrptr points to the first element inside the +branch. On exit it is set to point to the ALT or KET. + +Arguments: + pptrptr pointer to pointer in the parsed pattern + errcodeptr pointer to error code + lcptr pointer to loop counter + recurses chain of recurse_check to catch mutual recursion + cb pointer to compile block + +Returns: the length, or a negative value on error +*/ + +static int +get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, + parsed_recurse_check *recurses, compile_block *cb) +{ +int branchlength = 0; +int grouplength; +uint32_t lastitemlength = 0; +uint32_t *pptr = *pptrptr; +PCRE2_SIZE offset; +parsed_recurse_check this_recurse; + +/* A large and/or complex regex can take too long to process. This can happen +more often when (?| groups are present in the pattern because their length +cannot be cached. */ + +if ((*lcptr)++ > 2000) + { + *errcodeptr = ERR35; /* Lookbehind is too complicated */ + return -1; + } + +/* Scan the branch, accumulating the length. 
*/ + +for (;; pptr++) + { + parsed_recurse_check *r; + uint32_t *gptr, *gptrend; + uint32_t escape; + uint32_t group = 0; + uint32_t itemlength = 0; + + if (*pptr < META_END) + { + itemlength = 1; + } + + else switch (META_CODE(*pptr)) + { + case META_KET: + case META_ALT: + goto EXIT; + + /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the + actual termination. */ + + case META_ACCEPT: + case META_FAIL: + pptr = parsed_skip(pptr, PSKIP_ALT); + if (pptr == NULL) goto PARSED_SKIP_FAILED; + goto EXIT; + + case META_MARK: + case META_COMMIT_ARG: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + pptr += pptr[1] + 1; + break; + + case META_CIRCUMFLEX: + case META_COMMIT: + case META_DOLLAR: + case META_PRUNE: + case META_SKIP: + case META_THEN: + break; + + case META_OPTIONS: + pptr += 2; + break; + + case META_BIGVALUE: + itemlength = 1; + pptr += 1; + break; + + case META_CLASS: + case META_CLASS_NOT: + itemlength = 1; + pptr = parsed_skip(pptr, PSKIP_CLASS); + if (pptr == NULL) goto PARSED_SKIP_FAILED; + break; + + case META_CLASS_EMPTY_NOT: + case META_DOT: + itemlength = 1; + break; + + case META_CALLOUT_NUMBER: + pptr += 3; + break; + + case META_CALLOUT_STRING: + pptr += 3 + SIZEOFFSET; + break; + + /* Only some escapes consume a character. Of those, \R and \X are never + allowed because they might match more than one character. \C is allowed only in + 32-bit and non-UTF 8/16-bit modes. */ + + case META_ESCAPE: + escape = META_DATA(*pptr); + if (escape == ESC_R || escape == ESC_X) return -1; + if (escape > ESC_b && escape < ESC_Z) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C) + { + *errcodeptr = ERR36; + return -1; + } +#endif + itemlength = 1; + if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */ + } + break; + + /* Lookaheads do not contribute to the length of this branch, but they may + contain lookbehinds within them whose lengths need to be set. 
*/ + + case META_LOOKAHEAD: + case META_LOOKAHEADNOT: + case META_LOOKAHEAD_NA: + *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr); + if (*errcodeptr != 0) return -1; + + /* Ignore any qualifiers that follow a lookahead assertion. */ + + switch (pptr[1]) + { + case META_ASTERISK: + case META_ASTERISK_PLUS: + case META_ASTERISK_QUERY: + case META_PLUS: + case META_PLUS_PLUS: + case META_PLUS_QUERY: + case META_QUERY: + case META_QUERY_PLUS: + case META_QUERY_QUERY: + pptr++; + break; + + case META_MINMAX: + case META_MINMAX_PLUS: + case META_MINMAX_QUERY: + pptr += 3; + break; + + default: + break; + } + break; + + /* A nested lookbehind does not contribute any length to this lookbehind, + but must itself be checked and have its lengths set. */ + + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + case META_LOOKBEHIND_NA: + if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb)) + return -1; + break; + + /* Back references and recursions are handled by very similar code. At this + stage, the names generated in the parsing pass are available, but the main + name table has not yet been created. So for the named varieties, scan the + list of names in order to get the number of the first one in the pattern, + and whether or not this name is duplicated. 
*/ + + case META_BACKREF_BYNAME: + if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0) + goto ISNOTFIXED; + /* Fall through */ + + case META_RECURSE_BYNAME: + { + int i; + PCRE2_SPTR name; + BOOL is_dupname = FALSE; + named_group *ng = cb->named_groups; + uint32_t meta_code = META_CODE(*pptr); + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + for (i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0) + { + group = ng->number; + is_dupname = ng->isdup; + break; + } + } + + if (group == 0) + { + *errcodeptr = ERR15; /* Non-existent subpattern */ + cb->erroroffset = offset; + return -1; + } + + /* A numerical back reference can be fixed length if duplicate capturing + groups are not being used. A non-duplicate named back reference can also + be handled. */ + + if (meta_code == META_RECURSE_BYNAME || + (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)) + goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */ + } + goto ISNOTFIXED; /* Duplicate name or number */ + + /* The offset values for back references < 10 are in a separate vector + because otherwise they would use more than two parsed pattern elements on + 64-bit systems. */ + + case META_BACKREF: + if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 || + (cb->external_flags & PCRE2_DUPCAPUSED) != 0) + goto ISNOTFIXED; + group = META_DATA(*pptr); + if (group < 10) + { + offset = cb->small_ref_offset[group]; + goto RECURSE_OR_BACKREF_LENGTH; + } + + /* Fall through */ + /* For groups >= 10 - picking up group twice does no harm. */ + + /* A true recursion implies not fixed length, but a subroutine call may + be OK. Back reference "recursions" are also failed. 
*/ + + case META_RECURSE: + group = META_DATA(*pptr); + GETPLUSOFFSET(offset, pptr); + + RECURSE_OR_BACKREF_LENGTH: + if (group > cb->bracount) + { + cb->erroroffset = offset; + *errcodeptr = ERR15; /* Non-existent subpattern */ + return -1; + } + if (group == 0) goto ISNOTFIXED; /* Local recursion */ + for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++) + { + if (META_CODE(*gptr) == META_BIGVALUE) gptr++; + else if (*gptr == (META_CAPTURE | group)) break; + } + + /* We must start the search for the end of the group at the first meta code + inside the group. Otherwise it will be treated as an enclosed group. */ + + gptrend = parsed_skip(gptr + 1, PSKIP_KET); + if (gptrend == NULL) goto PARSED_SKIP_FAILED; + if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ + for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; + if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ + this_recurse.prev = recurses; + this_recurse.groupptr = gptr; + + /* We do not need to know the position of the end of the group, that is, + gptr is not used after the call to get_grouplength(). Setting the second + argument FALSE stops it scanning for the end when the length can be found + in the cache. */ + + gptr++; + grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, + &this_recurse, cb); + if (grouplength < 0) + { + if (*errcodeptr == 0) goto ISNOTFIXED; + return -1; /* Error already set */ + } + itemlength = grouplength; + break; + + /* A (DEFINE) group is never obeyed inline and so it does not contribute to + the length of this branch. Skip from the following item to the next + unpaired ket. */ + + case META_COND_DEFINE: + pptr = parsed_skip(pptr + 1, PSKIP_KET); + break; + + /* Check other nested groups - advance past the initial data for each type + and then seek a fixed length with get_grouplength(). 
*/ + + case META_COND_NAME: + case META_COND_NUMBER: + case META_COND_RNAME: + case META_COND_RNUMBER: + pptr += 2 + SIZEOFFSET; + goto CHECK_GROUP; + + case META_COND_ASSERT: + pptr += 1; + goto CHECK_GROUP; + + case META_COND_VERSION: + pptr += 4; + goto CHECK_GROUP; + + case META_CAPTURE: + group = META_DATA(*pptr); + /* Fall through */ + + case META_ATOMIC: + case META_NOCAPTURE: + case META_SCRIPT_RUN: + pptr++; + CHECK_GROUP: + grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, + recurses, cb); + if (grouplength < 0) return -1; + itemlength = grouplength; + break; + + /* Exact repetition is OK; variable repetition is not. A repetition of zero + must subtract the length that has already been added. */ + + case META_MINMAX: + case META_MINMAX_PLUS: + case META_MINMAX_QUERY: + if (pptr[1] == pptr[2]) + { + switch(pptr[1]) + { + case 0: + branchlength -= lastitemlength; + break; + + case 1: + itemlength = 0; + break; + + default: /* Check for integer overflow */ + if (lastitemlength != 0 && /* Should not occur, but just in case */ + INT_MAX/lastitemlength < pptr[1] - 1) + { + *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */ + return -1; + } + itemlength = (pptr[1] - 1) * lastitemlength; + break; + } + pptr += 2; + break; + } + /* Fall through */ + + /* Any other item means this branch does not have a fixed length. */ + + default: + ISNOTFIXED: + *errcodeptr = ERR25; /* Not fixed length */ + return -1; + } + + /* Add the item length to the branchlength, checking for integer overflow and + for the branch length exceeding the limit. */ + + if (INT_MAX - branchlength < (int)itemlength || + (branchlength += itemlength) > LOOKBEHIND_MAX) + { + *errcodeptr = ERR87; + return -1; + } + + /* Save this item length for use if the next item is a quantifier. 
*/ + + lastitemlength = itemlength; + } + +EXIT: +*pptrptr = pptr; +return branchlength; + +PARSED_SKIP_FAILED: +*errcodeptr = ERR90; +return -1; +} + + + +/************************************************* +* Set lengths in a lookbehind * +*************************************************/ + +/* This function is called for each lookbehind, to set the lengths in its +branches. An error occurs if any branch does not have a fixed length that is +less than the maximum (65535). On exit, the pointer must be left on the final +ket. + +The function also maintains the max_lookbehind value. Any lookbehind branch +that contains a nested lookbehind may actually look further back than the +length of the branch. The additional amount is passed back from +get_branchlength() as an "extra" value. + +Arguments: + pptrptr pointer to pointer in the parsed pattern + errcodeptr pointer to error code + lcptr pointer to loop counter + recurses chain of recurse_check to catch mutual recursion + cb pointer to compile block + +Returns: TRUE if all is well + FALSE otherwise, with error code and offset set +*/ + +static BOOL +set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, + parsed_recurse_check *recurses, compile_block *cb) +{ +PCRE2_SIZE offset; +int branchlength; +uint32_t *bptr = *pptrptr; + +READPLUSOFFSET(offset, bptr); /* Offset for error messages */ +*pptrptr += SIZEOFFSET; + +do + { + *pptrptr += 1; + branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); + if (branchlength < 0) + { + /* The errorcode and offset may already be set from a nested lookbehind. 
*/ + if (*errcodeptr == 0) *errcodeptr = ERR25; + if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; + return FALSE; + } + if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength; + *bptr |= branchlength; /* branchlength never more than 65535 */ + bptr = *pptrptr; + } +while (*bptr == META_ALT); + +return TRUE; +} + + + +/************************************************* +* Check parsed pattern lookbehinds * +*************************************************/ + +/* This function is called at the end of parsing a pattern if any lookbehinds +were encountered. It scans the parsed pattern for them, calling +set_lookbehind_lengths() for each one. At the start, the errorcode is zero and +the error offset is marked unset. This enables the functions above not to +override settings from deeper nestings. + +This function is called recursively from get_branchlength() for lookaheads in +order to process any lookbehinds that they may contain. It stops when it hits a +non-nested closing parenthesis in this case, returning a pointer to it. 
+ +Arguments + pptr points to where to start (start of pattern or start of lookahead) + retptr if not NULL, return the ket pointer here + recurses chain of recurse_check to catch mutual recursion + cb points to the compile block + lcptr points to loop counter + +Returns: 0 on success, or an errorcode (cb->erroroffset will be set) +*/ + +static int +check_lookbehinds(uint32_t *pptr, uint32_t **retptr, + parsed_recurse_check *recurses, compile_block *cb, int *lcptr) +{ +int errorcode = 0; +int nestlevel = 0; + +cb->erroroffset = PCRE2_UNSET; + +for (; *pptr != META_END; pptr++) + { + if (*pptr < META_END) continue; /* Literal */ + + switch (META_CODE(*pptr)) + { + default: + return ERR70; /* Unrecognized meta code */ + + case META_ESCAPE: + if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p) + pptr += 1; + break; + + case META_KET: + if (--nestlevel < 0) + { + if (retptr != NULL) *retptr = pptr; + return 0; + } + break; + + case META_ATOMIC: + case META_CAPTURE: + case META_COND_ASSERT: + case META_LOOKAHEAD: + case META_LOOKAHEADNOT: + case META_LOOKAHEAD_NA: + case META_NOCAPTURE: + case META_SCRIPT_RUN: + nestlevel++; + break; + + case META_ACCEPT: + case META_ALT: + case META_ASTERISK: + case META_ASTERISK_PLUS: + case META_ASTERISK_QUERY: + case META_BACKREF: + case META_CIRCUMFLEX: + case META_CLASS: + case META_CLASS_EMPTY: + case META_CLASS_EMPTY_NOT: + case META_CLASS_END: + case META_CLASS_NOT: + case META_COMMIT: + case META_DOLLAR: + case META_DOT: + case META_FAIL: + case META_PLUS: + case META_PLUS_PLUS: + case META_PLUS_QUERY: + case META_PRUNE: + case META_QUERY: + case META_QUERY_PLUS: + case META_QUERY_QUERY: + case META_RANGE_ESCAPED: + case META_RANGE_LITERAL: + case META_SKIP: + case META_THEN: + break; + + case META_RECURSE: + pptr += SIZEOFFSET; + break; + + case META_BACKREF_BYNAME: + case META_RECURSE_BYNAME: + pptr += 1 + SIZEOFFSET; + break; + + case META_COND_DEFINE: + pptr += SIZEOFFSET; + nestlevel++; + break; + + case 
META_COND_NAME: + case META_COND_NUMBER: + case META_COND_RNAME: + case META_COND_RNUMBER: + pptr += 1 + SIZEOFFSET; + nestlevel++; + break; + + case META_COND_VERSION: + pptr += 3; + nestlevel++; + break; + + case META_CALLOUT_STRING: + pptr += 3 + SIZEOFFSET; + break; + + case META_BIGVALUE: + case META_POSIX: + case META_POSIX_NEG: + pptr += 1; + break; + + case META_MINMAX: + case META_MINMAX_QUERY: + case META_MINMAX_PLUS: + case META_OPTIONS: + pptr += 2; + break; + + case META_CALLOUT_NUMBER: + pptr += 3; + break; + + case META_MARK: + case META_COMMIT_ARG: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + pptr += 1 + pptr[1]; + break; + + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + case META_LOOKBEHIND_NA: + if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb)) + return errorcode; + break; + } + } + +return 0; +} + + + +/************************************************* +* External function to compile a pattern * +*************************************************/ + +/* This function reads a regular expression in the form of a string and returns +a pointer to a block of store holding a compiled version of the expression. 
+ +Arguments: + pattern the regular expression + patlen the length of the pattern, or PCRE2_ZERO_TERMINATED + options option bits + errorptr pointer to errorcode + erroroffset pointer to error offset + ccontext points to a compile context or is NULL + +Returns: pointer to compiled data block, or NULL on error, + with errorcode and erroroffset set +*/ + +PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION +pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, + int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) +{ +BOOL utf; /* Set TRUE for UTF mode */ +BOOL ucp; /* Set TRUE for UCP mode */ +BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ +BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ +pcre2_real_code *re = NULL; /* What we will return */ +compile_block cb; /* "Static" compile-time data */ +const uint8_t *tables; /* Char tables base pointer */ + +PCRE2_UCHAR *code; /* Current pointer in compiled code */ +PCRE2_SPTR codestart; /* Start of compiled code */ +PCRE2_SPTR ptr; /* Current pointer in pattern */ +uint32_t *pptr; /* Current pointer in parsed pattern */ + +PCRE2_SIZE length = 1; /* Allow for final END opcode */ +PCRE2_SIZE usedlength; /* Actual length used */ +PCRE2_SIZE re_blocksize; /* Size of memory block */ +PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */ +PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */ + +uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ +uint32_t firstcu, reqcu; /* Value of first/req code unit */ +uint32_t setflags = 0; /* NL and BSR set flags */ + +uint32_t skipatstart; /* When checking (*UTF) etc */ +uint32_t limit_heap = UINT32_MAX; +uint32_t limit_match = UINT32_MAX; /* Unset match limits */ +uint32_t limit_depth = UINT32_MAX; + +int newline = 0; /* Unset; can be set by the pattern */ +int bsr = 0; /* Unset; can be set by the pattern */ +int errorcode = 0; /* Initialize to avoid compiler warn */ +int regexrc; 
/* Return from compile */ + +uint32_t i; /* Local loop counter */ + +/* Comments at the head of this file explain about these variables. */ + +uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; +uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE]; +named_group named_groups[NAMED_GROUP_LIST_SIZE]; + +/* The workspace is used in different ways in the different compiling phases. +It needs to be 16-bit aligned for the preliminary parsing scan. */ + +uint32_t c16workspace[C16_WORK_SIZE]; +PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace; + + +/* -------------- Check arguments and set up the pattern ----------------- */ + +/* There must be error code and offset pointers. */ + +if (errorptr == NULL || erroroffset == NULL) return NULL; +*errorptr = ERR0; +*erroroffset = 0; + +/* There must be a pattern! */ + +if (pattern == NULL) + { + *errorptr = ERR16; + return NULL; + } + +/* A NULL compile context means "use a default context" */ + +if (ccontext == NULL) + ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); + +/* PCRE2_MATCH_INVALID_UTF implies UTF */ + +if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF; + +/* Check that all undefined public option bits are zero. */ + +if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 || + (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0) + { + *errorptr = ERR17; + return NULL; + } + +if ((options & PCRE2_LITERAL) != 0 && + ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 || + (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0)) + { + *errorptr = ERR92; + return NULL; + } + +/* A zero-terminated pattern is indicated by the special length value +PCRE2_ZERO_TERMINATED. Check for an overlong pattern. 
*/ + +if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED))) + patlen = PRIV(strlen)(pattern); + +if (patlen > ccontext->max_pattern_length) + { + *errorptr = ERR88; + return NULL; + } + +/* From here on, all returns from this function should end up going via the +EXIT label. */ + + +/* ------------ Initialize the "static" compile data -------------- */ + +tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables); + +cb.lcc = tables + lcc_offset; /* Individual */ +cb.fcc = tables + fcc_offset; /* character */ +cb.cbits = tables + cbits_offset; /* tables */ +cb.ctypes = tables + ctypes_offset; + +cb.assert_depth = 0; +cb.bracount = 0; +cb.cx = ccontext; +cb.dupnames = FALSE; +cb.end_pattern = pattern + patlen; +cb.erroroffset = 0; +cb.external_flags = 0; +cb.external_options = options; +cb.groupinfo = stack_groupinfo; +cb.had_recurse = FALSE; +cb.lastcapture = 0; +cb.max_lookbehind = 0; +cb.name_entry_size = 0; +cb.name_table = NULL; +cb.named_groups = named_groups; +cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; +cb.names_found = 0; +cb.open_caps = NULL; +cb.parens_depth = 0; +cb.parsed_pattern = stack_parsed_pattern; +cb.req_varyopt = 0; +cb.start_code = cworkspace; +cb.start_pattern = pattern; +cb.start_workspace = cworkspace; +cb.workspace_size = COMPILE_WORK_SIZE; + +/* Maximum back reference and backref bitmap. The bitmap records up to 31 back +references to help in deciding whether (.*) can be treated as anchored or not. +*/ + +cb.top_backref = 0; +cb.backref_map = 0; + +/* Escape sequences \1 to \9 are always back references, but as they are only +two characters long, only two elements can be used in the parsed_pattern +vector. The first contains the reference, and we'd like to use the second to +record the offset in the pattern, so that forward references to non-existent +groups can be diagnosed later with an offset. However, on 64-bit systems, +PCRE2_SIZE won't fit. 
Instead, we have a vector of offsets for the first +occurrence of \1 to \9, indexed by the second parsed_pattern value. All other +references have enough space for the offset to be put into the parsed pattern. +*/ + +for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; + + +/* --------------- Start looking at the pattern --------------- */ + +/* Unless PCRE2_LITERAL is set, check for global one-time option settings at +the start of the pattern, and remember the offset to the actual regex. With +valgrind support, make the terminator of a zero-terminated pattern +inaccessible. This catches bugs that would otherwise only show up for +non-zero-terminated patterns. */ + +#ifdef SUPPORT_VALGRIND +if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); +#endif + +ptr = pattern; +skipatstart = 0; + +if ((options & PCRE2_LITERAL) == 0) + { + while (patlen - skipatstart >= 2 && + ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && + ptr[skipatstart+1] == CHAR_ASTERISK) + { + for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) + { + uint32_t c, pp; + pso *p = pso_list + i; + + if (patlen - skipatstart - 2 >= p->length && + PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name), + p->length) == 0) + { + skipatstart += p->length + 2; + switch(p->type) + { + case PSO_OPT: + cb.external_options |= p->value; + break; + + case PSO_FLG: + setflags |= p->value; + break; + + case PSO_NL: + newline = p->value; + setflags |= PCRE2_NL_SET; + break; + + case PSO_BSR: + bsr = p->value; + setflags |= PCRE2_BSR_SET; + break; + + case PSO_LIMM: + case PSO_LIMD: + case PSO_LIMH: + c = 0; + pp = skipatstart; + if (!IS_DIGIT(ptr[pp])) + { + errorcode = ERR60; + ptr += pp; + goto HAD_EARLY_ERROR; + } + while (IS_DIGIT(ptr[pp])) + { + if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ + c = c*10 + (ptr[pp++] - CHAR_0); + } + if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR60; + ptr += pp; + goto HAD_EARLY_ERROR; + } + if (p->type == PSO_LIMH) 
limit_heap = c; + else if (p->type == PSO_LIMM) limit_match = c; + else limit_depth = c; + skipatstart += pp - skipatstart; + break; + } + break; /* Out of the table scan loop */ + } + } + if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ + } + } + +/* End of pattern-start options; advance to start of real regex. */ + +ptr += skipatstart; + +/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */ + +#ifndef SUPPORT_UNICODE +if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) + { + errorcode = ERR32; + goto HAD_EARLY_ERROR; + } +#endif + +/* Check UTF. We have the original options in 'options', with that value as +modified by (*UTF) etc in cb->external_options. The extra option +PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the +surrogate code points cannot be represented in UTF-16. */ + +utf = (cb.external_options & PCRE2_UTF) != 0; +if (utf) + { + if ((options & PCRE2_NEVER_UTF) != 0) + { + errorcode = ERR74; + goto HAD_EARLY_ERROR; + } + if ((options & PCRE2_NO_UTF_CHECK) == 0 && + (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) + goto HAD_ERROR; /* Offset was set by valid_utf() */ + +#if PCRE2_CODE_UNIT_WIDTH == 16 + if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0) + { + errorcode = ERR91; + goto HAD_EARLY_ERROR; + } +#endif + } + +/* Check UCP lockout. */ + +ucp = (cb.external_options & PCRE2_UCP) != 0; +if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0) + { + errorcode = ERR75; + goto HAD_EARLY_ERROR; + } + +/* Process the BSR setting. */ + +if (bsr == 0) bsr = ccontext->bsr_convention; + +/* Process the newline setting. 
*/ + +if (newline == 0) newline = ccontext->newline_convention; +cb.nltype = NLTYPE_FIXED; +switch(newline) + { + case PCRE2_NEWLINE_CR: + cb.nllen = 1; + cb.nl[0] = CHAR_CR; + break; + + case PCRE2_NEWLINE_LF: + cb.nllen = 1; + cb.nl[0] = CHAR_NL; + break; + + case PCRE2_NEWLINE_NUL: + cb.nllen = 1; + cb.nl[0] = CHAR_NUL; + break; + + case PCRE2_NEWLINE_CRLF: + cb.nllen = 2; + cb.nl[0] = CHAR_CR; + cb.nl[1] = CHAR_NL; + break; + + case PCRE2_NEWLINE_ANY: + cb.nltype = NLTYPE_ANY; + break; + + case PCRE2_NEWLINE_ANYCRLF: + cb.nltype = NLTYPE_ANYCRLF; + break; + + default: + errorcode = ERR56; + goto HAD_EARLY_ERROR; + } + +/* Pre-scan the pattern to do two things: (1) Discover the named groups and +their numerical equivalents, so that this information is always available for +the remaining processing. (2) At the same time, parse the pattern and put a +processed version into the parsed_pattern vector. This has escapes interpreted +and comments removed (amongst other things). + +In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned +32-bit ints in the parsed pattern is bounded by the length of the pattern plus +one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is +set. The exceptional case is when running in 32-bit, non-UTF mode, when literal +characters greater than META_END (0x80000000) have to be coded as two units. In +this case, therefore, we scan the pattern to check for such values. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +if (!utf) + { + PCRE2_SPTR p; + for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++; + } +#endif + +/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT +is set we have to assume a numerical callout (4 elements) for each character +plus one at the end. This is overkill, but memory is plentiful these days. For +many smaller patterns the vector on the stack (which was set up above) can be +used. 
*/ + +parsed_size_needed = patlen - skipatstart + big32count; + +if ((ccontext->extra_options & + (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) + parsed_size_needed += 4; + +if ((options & PCRE2_AUTO_CALLOUT) != 0) + parsed_size_needed = (parsed_size_needed + 1) * 5; + +if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) + { + uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( + (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); + if (heap_parsed_pattern == NULL) + { + *errorptr = ERR21; + goto EXIT; + } + cb.parsed_pattern = heap_parsed_pattern; + } +cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; + +/* Do the parsing scan. */ + +errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); +if (errorcode != 0) goto HAD_CB_ERROR; + +/* Workspace is needed to remember information about numbered groups: whether a +group can match an empty string and what its fixed length is. This is done to +avoid the possibility of recursive references causing very long compile times +when checking these features. Unnumbered groups do not have this exposure since +they cannot be referenced. We use an indexed vector for this purpose. If there +are sufficiently few groups, the default vector on the stack, as set up above, +can be used. Otherwise we have to get/free a special vector. The vector must be +initialized to zero. */ + +if (cb.bracount >= GROUPINFO_DEFAULT_SIZE) + { + cb.groupinfo = ccontext->memctl.malloc( + (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); + if (cb.groupinfo == NULL) + { + errorcode = ERR21; + cb.erroroffset = 0; + goto HAD_CB_ERROR; + } + } +memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t)); + +/* If there were any lookbehinds, scan the parsed pattern to figure out their +lengths. 
*/ + +if (has_lookbehind) + { + int loopcount = 0; + errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount); + if (errorcode != 0) goto HAD_CB_ERROR; + } + +/* For debugging, there is a function that shows the parsed data vector. */ + +#ifdef DEBUG_SHOW_PARSED +fprintf(stderr, "+++ Pre-scan complete:\n"); +show_parsed(&cb); +#endif + +/* For debugging capturing information this code can be enabled. */ + +#ifdef DEBUG_SHOW_CAPTURES + { + named_group *ng = cb.named_groups; + fprintf(stderr, "+++Captures: %d\n", cb.bracount); + for (i = 0; i < cb.names_found; i++, ng++) + { + fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); + } + } +#endif + +/* Pretend to compile the pattern while actually just accumulating the amount +of memory required in the 'length' variable. This behaviour is triggered by +passing a non-NULL final argument to compile_regex(). We pass a block of +workspace (cworkspace) for it to compile parts of the pattern into; the +compiled code is discarded when it is no longer needed, so hopefully this +workspace will never overflow, though there is a test for its doing so. + +On error, errorcode will be set non-zero, so we don't need to look at the +result of the function. The initial options have been put into the cb block, +but we still have to pass a separate options variable (the first argument) +because the options may change as the pattern is processed. */ + +cb.erroroffset = patlen; /* For any subsequent errors that do not set it */ +pptr = cb.parsed_pattern; +code = cworkspace; +*code = OP_BRA; + +(void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr, + &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, + &length); + +if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */ + +/* This should be caught in compile_regex(), but just in case... 
*/ + +if (length > MAX_PATTERN_SIZE) + { + errorcode = ERR20; + goto HAD_CB_ERROR; + } + +/* Compute the size of, and then get and initialize, the data block for storing +the compiled pattern and names table. Integer overflow should no longer be +possible because nowadays we limit the maximum value of cb.names_found and +cb.name_entry_size. */ + +re_blocksize = sizeof(pcre2_real_code) + + CU2BYTES(length + + (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size); +re = (pcre2_real_code *) + ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); +if (re == NULL) + { + errorcode = ERR21; + goto HAD_CB_ERROR; + } + +/* The compiler may put padding at the end of the pcre2_real_code structure in +order to round it up to a multiple of 4 or 8 bytes. This means that when a +compiled pattern is copied (for example, when serialized) undefined bytes are +read, and this annoys debuggers such as valgrind. To avoid this, we explicitly +write to the last 8 bytes of the structure before setting the fields. */ + +memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8); +re->memctl = ccontext->memctl; +re->tables = tables; +re->executable_jit = NULL; +memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); +re->blocksize = re_blocksize; +re->magic_number = MAGIC_NUMBER; +re->compile_options = options; +re->overall_options = cb.external_options; +re->extra_options = ccontext->extra_options; +re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags; +re->limit_heap = limit_heap; +re->limit_match = limit_match; +re->limit_depth = limit_depth; +re->first_codeunit = 0; +re->last_codeunit = 0; +re->bsr_convention = bsr; +re->newline_convention = newline; +re->max_lookbehind = 0; +re->minlength = 0; +re->top_bracket = 0; +re->top_backref = 0; +re->name_entry_size = cb.name_entry_size; +re->name_count = cb.names_found; + +/* The basic block is immediately followed by the name table, and the compiled +code follows after that. 
*/ + +codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + + re->name_entry_size * re->name_count; + +/* Update the compile data block for the actual compile. The starting points of +the name/number translation table and of the code are passed around in the +compile data block. The start/end pattern and initial options are already set +from the pre-compile phase, as is the name_entry_size field. */ + +cb.parens_depth = 0; +cb.assert_depth = 0; +cb.lastcapture = 0; +cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); +cb.start_code = codestart; +cb.req_varyopt = 0; +cb.had_accept = FALSE; +cb.had_pruneorskip = FALSE; +cb.open_caps = NULL; + +/* If any named groups were found, create the name/number table from the list +created in the pre-pass. */ + +if (cb.names_found > 0) + { + named_group *ng = cb.named_groups; + for (i = 0; i < cb.names_found; i++, ng++) + add_name_to_table(&cb, ng->name, ng->length, ng->number, i); + } + +/* Set up a starting, non-extracting bracket, then compile the expression. On +error, errorcode will be set non-zero, so we don't need to look at the result +of the function here. */ + +pptr = cb.parsed_pattern; +code = (PCRE2_UCHAR *)codestart; +*code = OP_BRA; +regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code, + &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, + &cb, NULL); +if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY; +re->top_bracket = cb.bracount; +re->top_backref = cb.top_backref; +re->max_lookbehind = cb.max_lookbehind; + +if (cb.had_accept) + { + reqcu = 0; /* Must disable after (*ACCEPT) */ + reqcuflags = REQ_NONE; + re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */ + } + +/* Fill in the final opcode and check for disastrous overflow. 
If no overflow, +but the estimated length exceeds the really used length, adjust the value of +re->blocksize, and if valgrind support is configured, mark the extra allocated +memory as unaddressable, so that any out-of-bound reads can be detected. */ + +*code++ = OP_END; +usedlength = code - codestart; +if (usedlength > length) errorcode = ERR23; else + { + re->blocksize -= CU2BYTES(length - usedlength); +#ifdef SUPPORT_VALGRIND + VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength)); +#endif + } + +/* Scan the pattern for recursion/subroutine calls and convert the group +numbers into offsets. Maintain a small cache so that repeated groups containing +recursions are efficiently handled. */ + +#define RSCAN_CACHE_SIZE 8 + +if (errorcode == 0 && cb.had_recurse) + { + PCRE2_UCHAR *rcode; + PCRE2_SPTR rgroup; + unsigned int ccount = 0; + int start = RSCAN_CACHE_SIZE; + recurse_cache rc[RSCAN_CACHE_SIZE]; + + for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); + rcode != NULL; + rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) + { + int p, groupnumber; + + groupnumber = (int)GET(rcode, 1); + if (groupnumber == 0) rgroup = codestart; else + { + PCRE2_SPTR search_from = codestart; + rgroup = NULL; + for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) + { + if (groupnumber == rc[p].groupnumber) + { + rgroup = rc[p].group; + break; + } + + /* Group n+1 must always start to the right of group n, so we can save + search time below when the new group number is greater than any of the + previously found groups. 
*/ + + if (groupnumber > rc[p].groupnumber) search_from = rc[p].group; + } + + if (rgroup == NULL) + { + rgroup = PRIV(find_bracket)(search_from, utf, groupnumber); + if (rgroup == NULL) + { + errorcode = ERR53; + break; + } + if (--start < 0) start = RSCAN_CACHE_SIZE - 1; + rc[start].groupnumber = groupnumber; + rc[start].group = rgroup; + if (ccount < RSCAN_CACHE_SIZE) ccount++; + } + } + + PUT(rcode, 1, rgroup - codestart); + } + } + +/* In rare debugging situations we sometimes need to look at the compiled code +at this stage. */ + +#ifdef DEBUG_CALL_PRINTINT +pcre2_printint(re, stderr, TRUE); +fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); +#endif + +/* Unless disabled, check whether any single character iterators can be +auto-possessified. The function overwrites the appropriate opcode values, so +the type of the pointer must be cast. NOTE: the intermediate variable "temp" is +used in this code because at least one compiler gives a warning about loss of +"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the +function call. */ + +if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) + { + PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; + if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; + } + +/* Failed to compile, or error while post-processing. */ + +if (errorcode != 0) goto HAD_CB_ERROR; + +/* Successful compile. If the anchored option was not passed, set it if +we can determine that the pattern is anchored by virtue of ^ characters or \A +or anything else, such as starting with non-atomic .* when DOTALL is set and +there are no occurrences of *PRUNE or *SKIP (though there is an option to +disable this case). */ + +if ((re->overall_options & PCRE2_ANCHORED) == 0 && + is_anchored(codestart, 0, &cb, 0, FALSE)) + re->overall_options |= PCRE2_ANCHORED; + +/* Set up the first code unit or startline flag, the required code unit, and +then study the pattern. 
This code need not be obeyed if PCRE2_NO_START_OPTIMIZE +is set, as the data it would create will not be used. Note that a first code +unit (but not the startline flag) is useful for anchored patterns because it +can still give a quick "no match" and also avoid searching for a last code +unit. */ + +if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) + { + int minminlength = 0; /* For minimal minlength from first/required CU */ + + /* If we do not have a first code unit, see if there is one that is asserted + (these are not saved during the compile because they can cause conflicts with + actual literals that follow). */ + + if (firstcuflags >= REQ_NONE) + firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); + + /* Save the data for a first code unit. The existence of one means the + minimum length must be at least 1. */ + + if (firstcuflags < REQ_NONE) + { + re->first_codeunit = firstcu; + re->flags |= PCRE2_FIRSTSET; + minminlength++; + + /* Handle caseless first code units. */ + + if ((firstcuflags & REQ_CASELESS) != 0) + { + if (firstcu < 128 || (!utf && !ucp && firstcu < 255)) + { + if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; + } + + /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise. + In 8-bit UTF mode, codepoints in the range 128-255 are introductory code + points and cannot have another case, but if UCP is set they may do. */ + +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) + re->flags |= PCRE2_FIRSTCASELESS; +#else + else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT && + UCD_OTHERCASE(firstcu) != firstcu) + re->flags |= PCRE2_FIRSTCASELESS; +#endif +#endif /* SUPPORT_UNICODE */ + } + } + + /* When there is no first code unit, for non-anchored patterns, see if we can + set the PCRE2_STARTLINE flag. 
This is helpful for multiline matches when all + branches start with ^ and also when all branches start with non-atomic .* for + non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option + that disables this case.) */ + + else if ((re->overall_options & PCRE2_ANCHORED) == 0 && + is_startline(codestart, 0, &cb, 0, FALSE)) + re->flags |= PCRE2_STARTLINE; + + /* Handle the "required code unit", if one is set. In the UTF case we can + increment the minimum minimum length only if we are sure this really is a + different character and not a non-starting code unit of the first character, + because the minimum length count is in characters, not code units. */ + + if (reqcuflags < REQ_NONE) + { +#if PCRE2_CODE_UNIT_WIDTH == 16 + if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ + firstcuflags >= REQ_NONE || /* First not set */ + (firstcu & 0xf800) != 0xd800 || /* First not surrogate */ + (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */ +#elif PCRE2_CODE_UNIT_WIDTH == 8 + if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ + firstcuflags >= REQ_NONE || /* First not set */ + (firstcu & 0x80) == 0 || /* First is ASCII */ + (reqcu & 0x80) == 0) /* Req is ASCII */ +#endif + { + minminlength++; + } + + /* In the case of an anchored pattern, set up the value only if it follows + a variable length item in the pattern. */ + + if ((re->overall_options & PCRE2_ANCHORED) == 0 || + (reqcuflags & REQ_VARY) != 0) + { + re->last_codeunit = reqcu; + re->flags |= PCRE2_LASTSET; + + /* Handle caseless required code units as for first code units (above). 
*/ + + if ((reqcuflags & REQ_CASELESS) != 0) + { + if (reqcu < 128 || (!utf && !ucp && reqcu < 255)) + { + if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; + } +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; +#else + else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && + UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; +#endif +#endif /* SUPPORT_UNICODE */ + } + } + } + + /* Study the compiled pattern to set up information such as a bitmap of + starting code units and a minimum matching length. */ + + if (PRIV(study)(re) != 0) + { + errorcode = ERR31; + goto HAD_CB_ERROR; + } + + /* If study() set a bitmap of starting code units, it implies a minimum + length of at least one. */ + + if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0) + minminlength = 1; + + /* If the minimum length set (or not set) by study() is less than the minimum + implied by required code units, override it. */ + + if (re->minlength < minminlength) re->minlength = minminlength; + } /* End of start-of-match optimizations. */ + +/* Control ends up here in all cases. When running under valgrind, make a +pattern's terminating zero defined again. If memory was obtained for the parsed +version of the pattern, free it before returning. Also free the list of named +groups if a larger one had to be obtained, and likewise the group information +vector. 
*/ + +EXIT: +#ifdef SUPPORT_VALGRIND +if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); +#endif +if (cb.parsed_pattern != stack_parsed_pattern) + ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); +if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) + ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); +if (cb.groupinfo != stack_groupinfo) + ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); +return re; /* Will be NULL after an error */ + +/* Errors discovered in parse_regex() set the offset value in the compile +block. Errors discovered before it is called must compute it from the ptr +value. After parse_regex() is called, the offset in the compile block is set to +the end of the pattern, but certain errors in compile_regex() may reset it if +an offset is available in the parsed pattern. */ + +HAD_CB_ERROR: +ptr = pattern + cb.erroroffset; + +HAD_EARLY_ERROR: +*erroroffset = ptr - pattern; + +HAD_ERROR: +*errorptr = errorcode; +pcre2_code_free(re); +re = NULL; +goto EXIT; +} + +/* These #undefs are here to enable unity builds with CMake. */ + +#undef NLBLOCK /* Block containing newline information */ +#undef PSSTART /* Field containing processed string start */ +#undef PSEND /* Field containing processed string end */ + +/* End of pcre2_compile.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_config.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_config.c new file mode 100644 index 0000000000..548e218023 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_config.c @@ -0,0 +1,252 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +/* Save the configured link size, which is in bytes. 
In 16-bit and 32-bit modes
+its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to
+be in code units. */
+
+static int configured_link_size = LINK_SIZE;
+
+#include "regexp/pcre2/pcre2_internal.h"
+
+/* These macros are the standard way of turning unquoted text into C strings.
+They allow macros like PCRE2_MAJOR to be defined without quotes, which is
+convenient for user programs that want to test their values. XSTRING() expands
+its argument before STRING() applies the # operator, so XSTRING(PCRE2_MAJOR)
+gives the value of the macro as a string, not the text "PCRE2_MAJOR". */
+
+#define STRING(a)  # a
+#define XSTRING(s) STRING(s)
+
+
+/*************************************************
+* Return info about what features are configured *
+*************************************************/
+
+/* Report one item of information about how PCRE2 was configured. If where is
+NULL, nothing is stored and the length of memory required is returned.
+
+Arguments:
+  what             which PCRE2_CONFIG_xxx item is required
+  where            where to put the information, or NULL to request a length
+
+Returns:           0 if a numerical (uint32_t) value was stored via where
+                   sizeof(uint32_t) if where is NULL and the value is numerical
+                   for a string value: 1 + strlen() of the string if where is
+                     NULL, otherwise 1 + the yield of PRIV(strcpy_c8)()
+                   PCRE2_ERROR_BADOPTION if "what" is not recognized
+                     or JIT target requested when JIT not enabled
+*/
+
+PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
+pcre2_config(uint32_t what, void *where)
+{
+if (where == NULL)  /* Requests a length */
+  {
+  switch(what)
+    {
+    default:
+    return PCRE2_ERROR_BADOPTION;
+
+    case PCRE2_CONFIG_BSR:
+    case PCRE2_CONFIG_COMPILED_WIDTHS:
+    case PCRE2_CONFIG_DEPTHLIMIT:
+    case PCRE2_CONFIG_HEAPLIMIT:
+    case PCRE2_CONFIG_JIT:
+    case PCRE2_CONFIG_LINKSIZE:
+    case PCRE2_CONFIG_MATCHLIMIT:
+    case PCRE2_CONFIG_NEVER_BACKSLASH_C:
+    case PCRE2_CONFIG_NEWLINE:
+    case PCRE2_CONFIG_PARENSLIMIT:
+    case PCRE2_CONFIG_STACKRECURSE:    /* Obsolete */
+    case PCRE2_CONFIG_TABLES_LENGTH:
+    case PCRE2_CONFIG_UNICODE:
+    return sizeof(uint32_t);           /* Every numerical item is a uint32_t */
+
+    /* These yield strings whose length varies, so they are handled below,
+    where the same NULL test on "where" selects the length instead of a copy. */
+
+    case PCRE2_CONFIG_JITTARGET:
+    case PCRE2_CONFIG_UNICODE_VERSION:
+    case PCRE2_CONFIG_VERSION:
+    break;
+    }
+  }
+
+switch (what)
+  {
+  default:
+  return PCRE2_ERROR_BADOPTION;
+
+  case PCRE2_CONFIG_BSR:
+#ifdef BSR_ANYCRLF
+  *((uint32_t *)where) = PCRE2_BSR_ANYCRLF;
+#else
+  *((uint32_t *)where) = PCRE2_BSR_UNICODE;
+#endif
+  break;
+
+  case PCRE2_CONFIG_COMPILED_WIDTHS:   /* Bit mask: 1, 2, 4 for 8, 16, 32 bits */
+  *((uint32_t *)where) = 0
+#ifdef SUPPORT_PCRE2_8
+  + 1
+#endif
+#ifdef SUPPORT_PCRE2_16
+  + 2
+#endif
+#ifdef SUPPORT_PCRE2_32
+  + 4
+#endif
+  ;
+  break;
+
+  case PCRE2_CONFIG_DEPTHLIMIT:
+  *((uint32_t *)where) = MATCH_LIMIT_DEPTH;
+  break;
+
+  case PCRE2_CONFIG_HEAPLIMIT:
+  *((uint32_t *)where) = HEAP_LIMIT;
+  break;
+
+  case PCRE2_CONFIG_JIT:
+#ifdef SUPPORT_JIT
+  *((uint32_t *)where) = 1;
+#else
+  *((uint32_t *)where) = 0;
+#endif
+  break;
+
+  case PCRE2_CONFIG_JITTARGET:         /* String; an error if JIT not built in */
+#ifdef SUPPORT_JIT
+    {
+    const char *v = PRIV(jit_get_target)();
+    return (int)(1 + ((where == NULL)?
+      strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
+    }
+#else
+  return PCRE2_ERROR_BADOPTION;
+#endif
+
+  case PCRE2_CONFIG_LINKSIZE:          /* The value saved before the include */
+  *((uint32_t *)where) = (uint32_t)configured_link_size;
+  break;
+
+  case PCRE2_CONFIG_MATCHLIMIT:
+  *((uint32_t *)where) = MATCH_LIMIT;
+  break;
+
+  case PCRE2_CONFIG_NEWLINE:
+  *((uint32_t *)where) = NEWLINE_DEFAULT;
+  break;
+
+  case PCRE2_CONFIG_NEVER_BACKSLASH_C:
+#ifdef NEVER_BACKSLASH_C
+  *((uint32_t *)where) = 1;
+#else
+  *((uint32_t *)where) = 0;
+#endif
+  break;
+
+  case PCRE2_CONFIG_PARENSLIMIT:
+  *((uint32_t *)where) = PARENS_NEST_LIMIT;
+  break;
+
+  /* This is now obsolete. The stack is no longer used via recursion for
+  handling backtracking in pcre2_match(). */
+
+  case PCRE2_CONFIG_STACKRECURSE:
+  *((uint32_t *)where) = 0;
+  break;
+
+  case PCRE2_CONFIG_TABLES_LENGTH:
+  *((uint32_t *)where) = TABLES_LENGTH;
+  break;
+
+  case PCRE2_CONFIG_UNICODE_VERSION:
+    {
+#if defined SUPPORT_UNICODE
+    const char *v = PRIV(unicode_version);
+#else
+    const char *v = "Unicode not supported";
+#endif
+    return (int)(1 + ((where == NULL)?
+      strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
+    }
+  break;                               /* Not reached: the block above returns */
+
+  case PCRE2_CONFIG_UNICODE:
+#if defined SUPPORT_UNICODE
+  *((uint32_t *)where) = 1;
+#else
+  *((uint32_t *)where) = 0;
+#endif
+  break;
+
+  /* The hackery in setting "v" below is to cope with the case when
+  PCRE2_PRERELEASE is set to an empty string (which it is for real releases).
+  If the second alternative is used in this case, it does not leave a space
+  before the date. On the other hand, if all four macros are put into a single
+  XSTRING when PCRE2_PRERELEASE is not empty, an unwanted space is inserted.
+  There are problems using an "obvious" approach like this:
+
+     XSTRING(PCRE2_MAJOR) "." XSTRING(PCRE2_MINOR)
+     XSTRING(PCRE2_PRERELEASE) " " XSTRING(PCRE2_DATE)
+
+  because, when PCRE2_PRERELEASE is empty, this leads to an attempted expansion
+  of STRING(). The C standard states: "If (before argument substitution) any
+  argument consists of no preprocessing tokens, the behavior is undefined." It
+  turns out that gcc treats this case as a single empty string - which is what
+  we really want - but Visual C grumbles about the lack of an argument for the
+  macro. Unfortunately, both are within their rights. As there seems to be no
+  way to test for a macro's value being empty at compile time, we have to
+  resort to a runtime test: XSTRING(Z PCRE2_PRERELEASE) is just "Z" when
+  PCRE2_PRERELEASE is empty, so its second character is then the terminating
+  zero, and the first alternative is chosen. */
+
+  case PCRE2_CONFIG_VERSION:
+    {
+    const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)?
+      XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) :
+      XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE);
+    return (int)(1 + ((where == NULL)?
+      strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
+    }
+  }
+
+return 0;                              /* A numerical value was stored */
+}
+
+/* End of pcre2_config.c */
diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_context.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_context.c
new file mode 100644
index 0000000000..3e74935ab0
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_context.c
@@ -0,0 +1,494 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+#ifdef HAVE_CONFIG_H
+#include "regexp/pcre2/config.h"
+#endif
+
+#include "regexp/pcre2/pcre2_internal.h"
+
+
+
+/*************************************************
+*          Default malloc/free functions         *
+*************************************************/
+
+/* These wrap malloc() and free() so that they can be stored in the malloc and
+free fields of a memory control block, as is done further down this file.
+Ignore the "user data" argument in each case. */
+
+static void *default_malloc(size_t size, void *data)
+{
+(void)data;                  /* Unused; the cast avoids a compiler warning */
+return malloc(size);
+}
+
+
+static void default_free(void *block, void *data)
+{
+(void)data;                  /* Unused; the cast avoids a compiler warning */
+free(block);
+}
+
+
+
+/*************************************************
+*        Get a block and save memory control     *
+*************************************************/
+
+/* This internal function is called to get a block of memory in which the
+memory control data is to be stored at the start for future use. If memctl is
+NULL, the block is obtained with malloc() and the default functions above are
+recorded in it, with NULL memory data. Otherwise the block is obtained with
+memctl's own malloc function and a copy of *memctl is stored in it. No check
+is made here that size is at least sizeof(pcre2_memctl); callers must ensure
+that it is.
+
+Arguments:
+  size        amount of memory required
+  memctl      pointer to a memctl block or NULL
+
+Returns:      pointer to memory or NULL on failure
+*/
+
+extern void *
+PRIV(memctl_malloc)(size_t size, pcre2_memctl *memctl)
+{
+pcre2_memctl *newmemctl;
+void *yield = (memctl == NULL)? malloc(size) :
+  memctl->malloc(size, memctl->memory_data);
+if (yield == NULL) return NULL;
+newmemctl = (pcre2_memctl *)yield;   /* The memctl lives at the block's start */
+if (memctl == NULL)
+  {
+  newmemctl->malloc = default_malloc;
+  newmemctl->free = default_free;
+  newmemctl->memory_data = NULL;
+  }
+else *newmemctl = *memctl;
+return yield;
+}
+
+
+
+/*************************************************
+*          Create and initialize contexts        *
+*************************************************/
+
+/* Initializing for compile and match contexts is done in separate, private
+functions so that these can be called from functions such as pcre2_compile()
+when an external context is not supplied. The initializing functions have an
+option to set up default memory management.
+
+pcre2_general_context_create() obtains its block by calling private_malloc
+directly, and records the two functions and memory_data in the block. A NULL
+private_malloc or private_free selects the corresponding default function
+above. The yield is NULL if the memory cannot be obtained. */
+
+PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
+pcre2_general_context_create(void *(*private_malloc)(size_t, void *),
+  void (*private_free)(void *, void *), void *memory_data)
+{
+pcre2_general_context *gcontext;
+if (private_malloc == NULL) private_malloc = default_malloc;
+if (private_free == NULL) private_free = default_free;
+gcontext = private_malloc(sizeof(pcre2_real_general_context), memory_data);
+if (gcontext == NULL) return NULL;
+gcontext->memctl.malloc = private_malloc;
+gcontext->memctl.free = private_free;
+gcontext->memctl.memory_data = memory_data;
+return gcontext;
+}
+
+
+/* A default compile context is set up to save having to initialize at run time
+when no context is supplied to the compile function.
*/
+
+const pcre2_compile_context PRIV(default_compile_context) = {
+  { default_malloc, default_free, NULL },    /* Default memory handling */
+  NULL,                                      /* Stack guard */
+  NULL,                                      /* Stack guard data */
+  PRIV(default_tables),                      /* Character tables */
+  PCRE2_UNSET,                               /* Max pattern length */
+  BSR_DEFAULT,                               /* Backslash R default */
+  NEWLINE_DEFAULT,                           /* Newline convention */
+  PARENS_NEST_LIMIT,                         /* Parentheses nesting limit */
+  0 };                                       /* Extra options */
+
+/* The create function copies the default into the new memory, but must
+override the default memory handling functions if a gcontext was provided.
+This is needed because assigning the whole default block also overwrites the
+memory control data that PRIV(memctl_malloc)() stored at the start of the new
+block, so that data is copied from gcontext a second time. The yield is NULL
+if the memory cannot be obtained. */
+
+PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
+pcre2_compile_context_create(pcre2_general_context *gcontext)
+{
+pcre2_compile_context *ccontext = PRIV(memctl_malloc)(
+  sizeof(pcre2_real_compile_context), (pcre2_memctl *)gcontext);
+if (ccontext == NULL) return NULL;
+*ccontext = PRIV(default_compile_context);
+if (gcontext != NULL)
+  *((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext);
+return ccontext;
+}
+
+
+/* A default match context is set up to save having to initialize at run time
+when no context is supplied to a match function. The two JIT fields exist only
+when SUPPORT_JIT is defined. */
+
+const pcre2_match_context PRIV(default_match_context) = {
+  { default_malloc, default_free, NULL },    /* Default memory handling */
+#ifdef SUPPORT_JIT
+  NULL,          /* JIT callback */
+  NULL,          /* JIT callback data */
+#endif
+  NULL,          /* Callout function */
+  NULL,          /* Callout data */
+  NULL,          /* Substitute callout function */
+  NULL,          /* Substitute callout data */
+  PCRE2_UNSET,   /* Offset limit */
+  HEAP_LIMIT,    /* Default heap limit */
+  MATCH_LIMIT,   /* Default match limit */
+  MATCH_LIMIT_DEPTH };   /* Default depth limit */
+
+/* The create function copies the default into the new memory, but must
+override the default memory handling functions if a gcontext was provided.
*/
+
+PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
+pcre2_match_context_create(pcre2_general_context *gcontext)
+{
+pcre2_match_context *mcontext = PRIV(memctl_malloc)(
+  sizeof(pcre2_real_match_context), (pcre2_memctl *)gcontext);
+if (mcontext == NULL) return NULL;       /* Memory could not be obtained */
+*mcontext = PRIV(default_match_context); /* Overwrites the memctl as well */
+if (gcontext != NULL)                    /* So copy the caller's memctl again */
+  *((pcre2_memctl *)mcontext) = *((pcre2_memctl *)gcontext);
+return mcontext;
+}
+
+
+/* A default convert context is set up to save having to initialize at run time
+when no context is supplied to the convert function. The default path separator
+and escape character depend on whether _WIN32 is defined. */
+
+const pcre2_convert_context PRIV(default_convert_context) = {
+  { default_malloc, default_free, NULL },    /* Default memory handling */
+#ifdef _WIN32
+  CHAR_BACKSLASH,                            /* Default path separator */
+  CHAR_GRAVE_ACCENT                          /* Default escape character */
+#else  /* Not Windows */
+  CHAR_SLASH,                                /* Default path separator */
+  CHAR_BACKSLASH                             /* Default escape character */
+#endif
+  };
+
+/* The create function copies the default into the new memory, but must
+override the default memory handling functions if a gcontext was provided.
*/
+
+PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
+pcre2_convert_context_create(pcre2_general_context *gcontext)
+{
+pcre2_convert_context *ccontext = PRIV(memctl_malloc)(
+  sizeof(pcre2_real_convert_context), (pcre2_memctl *)gcontext);
+if (ccontext == NULL) return NULL;           /* Memory could not be obtained */
+*ccontext = PRIV(default_convert_context);   /* Overwrites the memctl as well */
+if (gcontext != NULL)                        /* So copy the caller's again */
+  *((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext);
+return ccontext;
+}
+
+
+/*************************************************
+*              Context copy functions            *
+*************************************************/
+
+PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
+pcre2_general_context_copy(pcre2_general_context *gcontext)
+{                          /* gcontext is dereferenced without a NULL check */
+pcre2_general_context *new =
+  gcontext->memctl.malloc(sizeof(pcre2_real_general_context),
+  gcontext->memctl.memory_data);   /* The source context's own allocator */
+if (new == NULL) return NULL;      /* Memory could not be obtained */
+memcpy(new, gcontext, sizeof(pcre2_real_general_context));
+return new;
+}
+
+
+PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
+pcre2_compile_context_copy(pcre2_compile_context *ccontext)
+{                          /* ccontext is dereferenced without a NULL check */
+pcre2_compile_context *new =
+  ccontext->memctl.malloc(sizeof(pcre2_real_compile_context),
+  ccontext->memctl.memory_data);   /* The source context's own allocator */
+if (new == NULL) return NULL;      /* Memory could not be obtained */
+memcpy(new, ccontext, sizeof(pcre2_real_compile_context));
+return new;
+}
+
+
+PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
+pcre2_match_context_copy(pcre2_match_context *mcontext)
+{                          /* mcontext is dereferenced without a NULL check */
+pcre2_match_context *new =
+  mcontext->memctl.malloc(sizeof(pcre2_real_match_context),
+  mcontext->memctl.memory_data);   /* The source context's own allocator */
+if (new == NULL) return NULL;      /* Memory could not be obtained */
+memcpy(new, mcontext, sizeof(pcre2_real_match_context));
+return new;
+}
+
+
+
+PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
+pcre2_convert_context_copy(pcre2_convert_context *ccontext)
+{                          /* ccontext is dereferenced without a NULL check */
+pcre2_convert_context *new =
+  ccontext->memctl.malloc(sizeof(pcre2_real_convert_context),
+  ccontext->memctl.memory_data);   /* The source context's own allocator */
+if (new == NULL) return NULL;      /* Memory could not be obtained */
+memcpy(new, ccontext, sizeof(pcre2_real_convert_context));
+return new;
+}
+
+
+/*************************************************
+*             Context free functions             *
+*************************************************/
+
+PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
+pcre2_general_context_free(pcre2_general_context *gcontext)
+{
+if (gcontext != NULL)      /* A NULL argument is accepted and does nothing */
+  gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
+}
+
+
+PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
+pcre2_compile_context_free(pcre2_compile_context *ccontext)
+{
+if (ccontext != NULL)      /* A NULL argument is accepted and does nothing */
+  ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
+}
+
+
+PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
+pcre2_match_context_free(pcre2_match_context *mcontext)
+{
+if (mcontext != NULL)      /* A NULL argument is accepted and does nothing */
+  mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
+}
+
+
+PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
+pcre2_convert_context_free(pcre2_convert_context *ccontext)
+{
+if (ccontext != NULL)      /* A NULL argument is accepted and does nothing */
+  ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
+}
+
+
+/*************************************************
+*             Set values in contexts             *
+*************************************************/
+
+/* All these functions return 0 for success or PCRE2_ERROR_BADDATA if invalid
+data is given. Only some of the functions are able to test the validity of the
+data.
*/ + + +/* ------------ Compile context ------------ */ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_character_tables(pcre2_compile_context *ccontext, + const uint8_t *tables) +{ +ccontext->tables = tables; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value) +{ +switch(value) + { + case PCRE2_BSR_ANYCRLF: + case PCRE2_BSR_UNICODE: + ccontext->bsr_convention = value; + return 0; + + default: + return PCRE2_ERROR_BADDATA; + } +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE length) +{ +ccontext->max_pattern_length = length; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline) +{ +switch(newline) + { + case PCRE2_NEWLINE_CR: + case PCRE2_NEWLINE_LF: + case PCRE2_NEWLINE_CRLF: + case PCRE2_NEWLINE_ANY: + case PCRE2_NEWLINE_ANYCRLF: + case PCRE2_NEWLINE_NUL: + ccontext->newline_convention = newline; + return 0; + + default: + return PCRE2_ERROR_BADDATA; + } +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit) +{ +ccontext->parens_nest_limit = limit; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t options) +{ +ccontext->extra_options = options; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, + int (*guard)(uint32_t, void *), void *user_data) +{ +ccontext->stack_guard = guard; +ccontext->stack_guard_data = user_data; +return 0; +} + + +/* ------------ Match context ------------ */ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_callout(pcre2_match_context *mcontext, + int (*callout)(pcre2_callout_block *, void *), void *callout_data) +{ +mcontext->callout = callout; +mcontext->callout_data = callout_data; 
+return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_substitute_callout(pcre2_match_context *mcontext, + int (*substitute_callout)(pcre2_substitute_callout_block *, void *), + void *substitute_callout_data) +{ +mcontext->substitute_callout = substitute_callout; +mcontext->substitute_callout_data = substitute_callout_data; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit) +{ +mcontext->heap_limit = limit; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t limit) +{ +mcontext->match_limit = limit; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t limit) +{ +mcontext->depth_limit = limit; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE limit) +{ +mcontext->offset_limit = limit; +return 0; +} + +/* These functions became obsolete at release 10.30. The first is kept as a +synonym for backwards compatibility. The second now does nothing. Exclude both +from coverage reports. 
*/ + +/* LCOV_EXCL_START */ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit) +{ +return pcre2_set_depth_limit(mcontext, limit); +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_recursion_memory_management(pcre2_match_context *mcontext, + void *(*mymalloc)(size_t, void *), void (*myfree)(void *, void *), + void *mydata) +{ +(void)mcontext; +(void)mymalloc; +(void)myfree; +(void)mydata; +return 0; +} + +/* LCOV_EXCL_STOP */ + + +/* ------------ Convert context ------------ */ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_glob_separator(pcre2_convert_context *ccontext, uint32_t separator) +{ +if (separator != CHAR_SLASH && separator != CHAR_BACKSLASH && + separator != CHAR_DOT) return PCRE2_ERROR_BADDATA; +ccontext->glob_separator = separator; +return 0; +} + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_glob_escape(pcre2_convert_context *ccontext, uint32_t escape) +{ +if (escape > 255 || (escape != 0 && !ispunct(escape))) + return PCRE2_ERROR_BADDATA; +ccontext->glob_escape = escape; +return 0; +} + +/* End of pcre2_context.c */ + diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_convert.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_convert.c new file mode 100644 index 0000000000..53e1072788 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_convert.c @@ -0,0 +1,1181 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + +#define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \ + PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED) + +#define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \ + PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \ + PCRE2_CONVERT_GLOB_NO_STARSTAR| \ + TYPE_OPTIONS) + +#define DUMMY_BUFFER_SIZE 100 + +/* Generated pattern fragments */ + +#define STR_BACKSLASH_A STR_BACKSLASH STR_A +#define STR_BACKSLASH_z STR_BACKSLASH STR_z +#define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET +#define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN +#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS +#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS +#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS + +/* States for POSIX processing */ + +enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET, + POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED }; + +/* Macro to add a character string to the output buffer, checking for overflow. */ + +#define PUTCHARS(string) \ + { \ + for (s = (char *)(string); *s != 0; s++) \ + { \ + if (p >= endp) return PCRE2_ERROR_NOMEMORY; \ + *p++ = *s; \ + } \ + } + +/* Literals that must be escaped: \ ? * + | . 
^ $ { } [ ] ( ) */ + +static const char *pcre2_escaped_literals = + STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS + STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN + STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET + STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET + STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS; + +/* Recognized escaped metacharacters in POSIX basic patterns. */ + +static const char *posix_meta_escapes = + STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS + STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET + STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9; + + + +/************************************************* +* Convert a POSIX pattern * +*************************************************/ + +/* This function handles both basic and extended POSIX patterns. + +Arguments: + pattype the pattern type + pattern the pattern + plength length in code units + utf TRUE if UTF + use_buffer where to put the output + use_length length of use_buffer + bufflenptr where to put the used length + dummyrun TRUE if a dummy run + ccontext the convert context + +Returns: 0 => success + !0 => error code +*/ + +static int +convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength, + BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, + PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) +{ +char *s; +PCRE2_SPTR posix = pattern; +PCRE2_UCHAR *p = use_buffer; +PCRE2_UCHAR *pp = p; +PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */ +PCRE2_SIZE convlength = 0; + +uint32_t bracount = 0; +uint32_t posix_state = POSIX_START_REGEX; +uint32_t lastspecial = 0; +BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0; +BOOL nextisliteral = FALSE; + +(void)utf; /* Not used when Unicode not supported */ +(void)ccontext; /* Not currently used */ + +/* Initialize default for error offset as end of input. */ + +*bufflenptr = plength; +PUTCHARS(STR_STAR_NUL); + +/* Now scan the input. 
*/ + +while (plength > 0) + { + uint32_t c, sc; + int clength = 1; + + /* Add in the length of the last item, then, if in the dummy run, pull the + pointer back to the start of the (temporary) buffer and then remember the + start of the next item. */ + + convlength += p - pp; + if (dummyrun) p = use_buffer; + pp = p; + + /* Pick up the next character */ + +#ifndef SUPPORT_UNICODE + c = *posix; +#else + GETCHARLENTEST(c, posix, clength); +#endif + posix += clength; + plength -= clength; + + sc = nextisliteral? 0 : c; + nextisliteral = FALSE; + + /* Handle a character within a class. */ + + if (posix_state >= POSIX_CLASS_NOT_STARTED) + { + if (c == CHAR_RIGHT_SQUARE_BRACKET) + { + PUTCHARS(STR_RIGHT_SQUARE_BRACKET); + posix_state = POSIX_NOT_BRACKET; + } + + /* Not the end of the class */ + + else + { + switch (posix_state) + { + case POSIX_CLASS_STARTED: + if (c <= 127 && islower(c)) break; /* Remain in started state */ + posix_state = POSIX_CLASS_NOT_STARTED; + if (c == CHAR_COLON && plength > 0 && + *posix == CHAR_RIGHT_SQUARE_BRACKET) + { + PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET); + plength--; + posix++; + continue; /* With next character after :] */ + } + /* Fall through */ + + case POSIX_CLASS_NOT_STARTED: + if (c == CHAR_LEFT_SQUARE_BRACKET) + posix_state = POSIX_CLASS_STARTING; + break; + + case POSIX_CLASS_STARTING: + if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED; + break; + } + + if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH); + if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; + memcpy(p, posix - clength, CU2BYTES(clength)); + p += clength; + } + } + + /* Handle a character not within a class. */ + + else switch(sc) + { + case CHAR_LEFT_SQUARE_BRACKET: + PUTCHARS(STR_LEFT_SQUARE_BRACKET); + +#ifdef NEVER + /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does + support) but they are not part of POSIX 1003.1. 
*/ + + if (plength >= 6) + { + if (posix[0] == CHAR_LEFT_SQUARE_BRACKET && + posix[1] == CHAR_COLON && + (posix[2] == CHAR_LESS_THAN_SIGN || + posix[2] == CHAR_GREATER_THAN_SIGN) && + posix[3] == CHAR_COLON && + posix[4] == CHAR_RIGHT_SQUARE_BRACKET && + posix[5] == CHAR_RIGHT_SQUARE_BRACKET) + { + if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY; + memcpy(p, posix, CU2BYTES(6)); + p += 6; + posix += 6; + plength -= 6; + continue; /* With next character */ + } + } +#endif + + /* Handle start of "normal" character classes */ + + posix_state = POSIX_CLASS_NOT_STARTED; + + /* Handle ^ and ] as first characters */ + + if (plength > 0) + { + if (*posix == CHAR_CIRCUMFLEX_ACCENT) + { + posix++; + plength--; + PUTCHARS(STR_CIRCUMFLEX_ACCENT); + } + if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET) + { + posix++; + plength--; + PUTCHARS(STR_RIGHT_SQUARE_BRACKET); + } + } + break; + + case CHAR_BACKSLASH: + if (plength == 0) return PCRE2_ERROR_END_BACKSLASH; + if (extended) nextisliteral = TRUE; else + { + if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL) + { + if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH); + if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; + lastspecial = *p++ = *posix++; + plength--; + } + else nextisliteral = TRUE; + } + break; + + case CHAR_RIGHT_PARENTHESIS: + if (!extended || bracount == 0) goto ESCAPE_LITERAL; + bracount--; + goto COPY_SPECIAL; + + case CHAR_LEFT_PARENTHESIS: + bracount++; + /* Fall through */ + + case CHAR_QUESTION_MARK: + case CHAR_PLUS: + case CHAR_LEFT_CURLY_BRACKET: + case CHAR_RIGHT_CURLY_BRACKET: + case CHAR_VERTICAL_LINE: + if (!extended) goto ESCAPE_LITERAL; + /* Fall through */ + + case CHAR_DOT: + case CHAR_DOLLAR_SIGN: + posix_state = POSIX_NOT_BRACKET; + COPY_SPECIAL: + lastspecial = c; + if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; + *p++ = c; + break; + + case CHAR_ASTERISK: + if (lastspecial != CHAR_ASTERISK) + { + if (!extended && (posix_state < POSIX_NOT_BRACKET || + lastspecial == 
CHAR_LEFT_PARENTHESIS)) + goto ESCAPE_LITERAL; + goto COPY_SPECIAL; + } + break; /* Ignore second and subsequent asterisks */ + + case CHAR_CIRCUMFLEX_ACCENT: + if (extended) goto COPY_SPECIAL; + if (posix_state == POSIX_START_REGEX || + lastspecial == CHAR_LEFT_PARENTHESIS) + { + posix_state = POSIX_ANCHORED; + goto COPY_SPECIAL; + } + /* Fall through */ + + default: + if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) + { + ESCAPE_LITERAL: + PUTCHARS(STR_BACKSLASH); + } + lastspecial = 0xff; /* Indicates nothing special */ + if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; + memcpy(p, posix - clength, CU2BYTES(clength)); + p += clength; + posix_state = POSIX_NOT_BRACKET; + break; + } + } + +if (posix_state >= POSIX_CLASS_NOT_STARTED) + return PCRE2_ERROR_MISSING_SQUARE_BRACKET; +convlength += p - pp; /* Final segment */ +*bufflenptr = convlength; +*p++ = 0; +return 0; +} + + +/************************************************* +* Convert a glob pattern * +*************************************************/ + +/* Context for writing the output into a buffer. */ + +typedef struct pcre2_output_context { + PCRE2_UCHAR *output; /* current output position */ + PCRE2_SPTR output_end; /* output end */ + PCRE2_SIZE output_size; /* size of the output */ + uint8_t out_str[8]; /* string copied to the output */ +} pcre2_output_context; + + +/* Write a character into the output. + +Arguments: + out output context + chr the next character +*/ + +static void +convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr) +{ +out->output_size++; + +if (out->output < out->output_end) + *out->output++ = chr; +} + + +/* Write a string into the output. 
+ +Arguments: + out output context + length length of out->out_str +*/ + +static void +convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length) +{ +uint8_t *out_str = out->out_str; +PCRE2_UCHAR *output = out->output; +PCRE2_SPTR output_end = out->output_end; +PCRE2_SIZE output_size = out->output_size; + +do + { + output_size++; + + if (output < output_end) + *output++ = *out_str++; + } +while (--length != 0); + +out->output = output; +out->output_size = output_size; +} + + +/* Prints the separator into the output. + +Arguments: + out output context + separator glob separator + with_escape backslash is needed before separator +*/ + +static void +convert_glob_print_separator(pcre2_output_context *out, + PCRE2_UCHAR separator, BOOL with_escape) +{ +if (with_escape) + convert_glob_write(out, CHAR_BACKSLASH); + +convert_glob_write(out, separator); +} + + +/* Prints a wildcard into the output. + +Arguments: + out output context + separator glob separator + with_escape backslash is needed before separator +*/ + +static void +convert_glob_print_wildcard(pcre2_output_context *out, + PCRE2_UCHAR separator, BOOL with_escape) +{ +out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; +out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; +convert_glob_write_str(out, 2); + +convert_glob_print_separator(out, separator, with_escape); + +convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); +} + + +/* Parse a posix class. 
+ +Arguments: + from starting point of scanning the range + pattern_end end of pattern + out output context + +Returns: >0 => class index + 0 => malformed class +*/ + +static int +convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, + pcre2_output_context *out) +{ +static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:" + "graph:lower:print:punct:space:upper:word:xdigit:"; +PCRE2_SPTR start = *from + 1; +PCRE2_SPTR pattern = start; +const char *class_ptr; +PCRE2_UCHAR c; +int class_index; + +while (TRUE) + { + if (pattern >= pattern_end) return 0; + + c = *pattern++; + + if (c < CHAR_a || c > CHAR_z) break; + } + +if (c != CHAR_COLON || pattern >= pattern_end || + *pattern != CHAR_RIGHT_SQUARE_BRACKET) + return 0; + +class_ptr = posix_classes; +class_index = 1; + +while (TRUE) + { + if (*class_ptr == CHAR_NUL) return 0; + + pattern = start; + + while (*pattern == (PCRE2_UCHAR) *class_ptr) + { + if (*pattern == CHAR_COLON) + { + pattern += 2; + start -= 2; + + do convert_glob_write(out, *start++); while (start < pattern); + + *from = pattern; + return class_index; + } + pattern++; + class_ptr++; + } + + while (*class_ptr != CHAR_COLON) class_ptr++; + class_ptr++; + class_index++; + } +} + +/* Checks whether the character is in the class. 
+ +Arguments: + class_index class index + c character + +Returns: !0 => character is found in the class + 0 => otherwise +*/ + +static BOOL +convert_glob_char_in_class(int class_index, PCRE2_UCHAR c) +{ +switch (class_index) + { + case 1: return isalnum(c); + case 2: return isalpha(c); + case 3: return 1; + case 4: return c == CHAR_HT || c == CHAR_SPACE; + case 5: return iscntrl(c); + case 6: return isdigit(c); + case 7: return isgraph(c); + case 8: return islower(c); + case 9: return isprint(c); + case 10: return ispunct(c); + case 11: return isspace(c); + case 12: return isupper(c); + case 13: return isalnum(c) || c == CHAR_UNDERSCORE; + default: return isxdigit(c); + } +} + +/* Parse a range of characters. + +Arguments: + from starting point of scanning the range + pattern_end end of pattern + out output context + separator glob separator + with_escape backslash is needed before separator + +Returns: 0 => success + !0 => error code +*/ + +static int +convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, + pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator, + BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep) +{ +BOOL is_negative = FALSE; +BOOL separator_seen = FALSE; +BOOL has_prev_c; +PCRE2_SPTR pattern = *from; +PCRE2_SPTR char_start = NULL; +uint32_t c, prev_c; +int len, class_index; + +(void)utf; /* Avoid compiler warning. 
*/ + +if (pattern >= pattern_end) + { + *from = pattern; + return PCRE2_ERROR_MISSING_SQUARE_BRACKET; + } + +if (*pattern == CHAR_EXCLAMATION_MARK + || *pattern == CHAR_CIRCUMFLEX_ACCENT) + { + pattern++; + + if (pattern >= pattern_end) + { + *from = pattern; + return PCRE2_ERROR_MISSING_SQUARE_BRACKET; + } + + is_negative = TRUE; + + out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; + out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; + len = 2; + + if (!no_wildsep) + { + if (with_escape) + { + out->out_str[len] = CHAR_BACKSLASH; + len++; + } + out->out_str[len] = (uint8_t) separator; + } + + convert_glob_write_str(out, len + 1); + } +else + convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET); + +has_prev_c = FALSE; +prev_c = 0; + +if (*pattern == CHAR_RIGHT_SQUARE_BRACKET) + { + out->out_str[0] = CHAR_BACKSLASH; + out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET; + convert_glob_write_str(out, 2); + has_prev_c = TRUE; + prev_c = CHAR_RIGHT_SQUARE_BRACKET; + pattern++; + } + +while (pattern < pattern_end) + { + char_start = pattern; + GETCHARINCTEST(c, pattern); + + if (c == CHAR_RIGHT_SQUARE_BRACKET) + { + convert_glob_write(out, c); + + if (!is_negative && !no_wildsep && separator_seen) + { + out->out_str[0] = CHAR_LEFT_PARENTHESIS; + out->out_str[1] = CHAR_QUESTION_MARK; + out->out_str[2] = CHAR_LESS_THAN_SIGN; + out->out_str[3] = CHAR_EXCLAMATION_MARK; + convert_glob_write_str(out, 4); + + convert_glob_print_separator(out, separator, with_escape); + convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); + } + + *from = pattern; + return 0; + } + + if (pattern >= pattern_end) break; + + if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) + { + *from = pattern; + class_index = convert_glob_parse_class(from, pattern_end, out); + + if (class_index != 0) + { + pattern = *from; + + has_prev_c = FALSE; + prev_c = 0; + + if (!is_negative && + convert_glob_char_in_class (class_index, separator)) + separator_seen = TRUE; + continue; + } + } + else if (c == CHAR_MINUS && has_prev_c && + 
*pattern != CHAR_RIGHT_SQUARE_BRACKET) + { + convert_glob_write(out, CHAR_MINUS); + + char_start = pattern; + GETCHARINCTEST(c, pattern); + + if (pattern >= pattern_end) break; + + if (escape != 0 && c == escape) + { + char_start = pattern; + GETCHARINCTEST(c, pattern); + } + else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) + { + *from = pattern; + return PCRE2_ERROR_CONVERT_SYNTAX; + } + + if (prev_c > c) + { + *from = pattern; + return PCRE2_ERROR_CONVERT_SYNTAX; + } + + if (prev_c < separator && separator < c) separator_seen = TRUE; + + has_prev_c = FALSE; + prev_c = 0; + } + else + { + if (escape != 0 && c == escape) + { + char_start = pattern; + GETCHARINCTEST(c, pattern); + + if (pattern >= pattern_end) break; + } + + has_prev_c = TRUE; + prev_c = c; + } + + if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || + c == CHAR_BACKSLASH || c == CHAR_MINUS) + convert_glob_write(out, CHAR_BACKSLASH); + + if (c == separator) separator_seen = TRUE; + + do convert_glob_write(out, *char_start++); while (char_start < pattern); + } + +*from = pattern; +return PCRE2_ERROR_MISSING_SQUARE_BRACKET; +} + + +/* Prints a (*COMMIT) into the output. + +Arguments: + out output context +*/ + +static void +convert_glob_print_commit(pcre2_output_context *out) +{ +out->out_str[0] = CHAR_LEFT_PARENTHESIS; +out->out_str[1] = CHAR_ASTERISK; +out->out_str[2] = CHAR_C; +out->out_str[3] = CHAR_O; +out->out_str[4] = CHAR_M; +out->out_str[5] = CHAR_M; +out->out_str[6] = CHAR_I; +out->out_str[7] = CHAR_T; +convert_glob_write_str(out, 8); +convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); +} + + +/* Bash glob converter. 
+ +Arguments: + pattype the pattern type + pattern the pattern + plength length in code units + utf TRUE if UTF + use_buffer where to put the output + use_length length of use_buffer + bufflenptr where to put the used length + dummyrun TRUE if a dummy run + ccontext the convert context + +Returns: 0 => success + !0 => error code +*/ + +static int +convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength, + BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, + PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) +{ +pcre2_output_context out; +PCRE2_SPTR pattern_start = pattern; +PCRE2_SPTR pattern_end = pattern + plength; +PCRE2_UCHAR separator = ccontext->glob_separator; +PCRE2_UCHAR escape = ccontext->glob_escape; +PCRE2_UCHAR c; +BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0; +BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0; +BOOL in_atomic = FALSE; +BOOL after_starstar = FALSE; +BOOL no_slash_z = FALSE; +BOOL with_escape, is_start, after_separator; +int result = 0; + +(void)utf; /* Avoid compiler warning. */ + +#ifdef SUPPORT_UNICODE +if (utf && (separator >= 128 || escape >= 128)) + { + /* Currently only ASCII characters are supported. */ + *bufflenptr = 0; + return PCRE2_ERROR_CONVERT_SYNTAX; + } +#endif + +with_escape = strchr(pcre2_escaped_literals, separator) != NULL; + +/* Initialize default for error offset as end of input. 
*/ +out.output = use_buffer; +out.output_end = use_buffer + use_length; +out.output_size = 0; + +out.out_str[0] = CHAR_LEFT_PARENTHESIS; +out.out_str[1] = CHAR_QUESTION_MARK; +out.out_str[2] = CHAR_s; +out.out_str[3] = CHAR_RIGHT_PARENTHESIS; +convert_glob_write_str(&out, 4); + +is_start = TRUE; + +if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK) + { + if (no_wildsep) + is_start = FALSE; + else if (!no_starstar && pattern + 1 < pattern_end && + pattern[1] == CHAR_ASTERISK) + is_start = FALSE; + } + +if (is_start) + { + out.out_str[0] = CHAR_BACKSLASH; + out.out_str[1] = CHAR_A; + convert_glob_write_str(&out, 2); + } + +while (pattern < pattern_end) + { + c = *pattern++; + + if (c == CHAR_ASTERISK) + { + is_start = pattern == pattern_start + 1; + + if (in_atomic) + { + convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); + in_atomic = FALSE; + } + + if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK) + { + after_separator = is_start || (pattern[-2] == separator); + + do pattern++; while (pattern < pattern_end && + *pattern == CHAR_ASTERISK); + + if (pattern >= pattern_end) + { + no_slash_z = TRUE; + break; + } + + after_starstar = TRUE; + + if (after_separator && escape != 0 && *pattern == escape && + pattern + 1 < pattern_end && pattern[1] == separator) + pattern++; + + if (is_start) + { + if (*pattern != separator) continue; + + out.out_str[0] = CHAR_LEFT_PARENTHESIS; + out.out_str[1] = CHAR_QUESTION_MARK; + out.out_str[2] = CHAR_COLON; + out.out_str[3] = CHAR_BACKSLASH; + out.out_str[4] = CHAR_A; + out.out_str[5] = CHAR_VERTICAL_LINE; + convert_glob_write_str(&out, 6); + + convert_glob_print_separator(&out, separator, with_escape); + convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); + + pattern++; + continue; + } + + convert_glob_print_commit(&out); + + if (!after_separator || *pattern != separator) + { + out.out_str[0] = CHAR_DOT; + out.out_str[1] = CHAR_ASTERISK; + out.out_str[2] = CHAR_QUESTION_MARK; + convert_glob_write_str(&out, 3); 
+ continue; + } + + out.out_str[0] = CHAR_LEFT_PARENTHESIS; + out.out_str[1] = CHAR_QUESTION_MARK; + out.out_str[2] = CHAR_COLON; + out.out_str[3] = CHAR_DOT; + out.out_str[4] = CHAR_ASTERISK; + out.out_str[5] = CHAR_QUESTION_MARK; + + convert_glob_write_str(&out, 6); + + convert_glob_print_separator(&out, separator, with_escape); + + out.out_str[0] = CHAR_RIGHT_PARENTHESIS; + out.out_str[1] = CHAR_QUESTION_MARK; + out.out_str[2] = CHAR_QUESTION_MARK; + convert_glob_write_str(&out, 3); + + pattern++; + continue; + } + + if (pattern < pattern_end && *pattern == CHAR_ASTERISK) + { + do pattern++; while (pattern < pattern_end && + *pattern == CHAR_ASTERISK); + } + + if (no_wildsep) + { + if (pattern >= pattern_end) + { + no_slash_z = TRUE; + break; + } + + /* Start check must be after the end check. */ + if (is_start) continue; + } + + if (!is_start) + { + if (after_starstar) + { + out.out_str[0] = CHAR_LEFT_PARENTHESIS; + out.out_str[1] = CHAR_QUESTION_MARK; + out.out_str[2] = CHAR_GREATER_THAN_SIGN; + convert_glob_write_str(&out, 3); + in_atomic = TRUE; + } + else + convert_glob_print_commit(&out); + } + + if (no_wildsep) + convert_glob_write(&out, CHAR_DOT); + else + convert_glob_print_wildcard(&out, separator, with_escape); + + out.out_str[0] = CHAR_ASTERISK; + out.out_str[1] = CHAR_QUESTION_MARK; + if (pattern >= pattern_end) + out.out_str[1] = CHAR_PLUS; + convert_glob_write_str(&out, 2); + continue; + } + + if (c == CHAR_QUESTION_MARK) + { + if (no_wildsep) + convert_glob_write(&out, CHAR_DOT); + else + convert_glob_print_wildcard(&out, separator, with_escape); + continue; + } + + if (c == CHAR_LEFT_SQUARE_BRACKET) + { + result = convert_glob_parse_range(&pattern, pattern_end, + &out, utf, separator, with_escape, escape, no_wildsep); + if (result != 0) break; + continue; + } + + if (escape != 0 && c == escape) + { + if (pattern >= pattern_end) + { + result = PCRE2_ERROR_CONVERT_SYNTAX; + break; + } + c = *pattern++; + } + + if (c < 128 && 
strchr(pcre2_escaped_literals, c) != NULL) + convert_glob_write(&out, CHAR_BACKSLASH); + + convert_glob_write(&out, c); + } + +if (result == 0) + { + if (!no_slash_z) + { + out.out_str[0] = CHAR_BACKSLASH; + out.out_str[1] = CHAR_z; + convert_glob_write_str(&out, 2); + } + + if (in_atomic) + convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); + + convert_glob_write(&out, CHAR_NUL); + + if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer)) + result = PCRE2_ERROR_NOMEMORY; + } + +if (result != 0) + { + *bufflenptr = pattern - pattern_start; + return result; + } + +*bufflenptr = out.output_size - 1; +return 0; +} + + +/************************************************* +* Convert pattern * +*************************************************/ + +/* This is the external-facing function for converting other forms of pattern +into PCRE2 regular expression patterns. On error, the bufflenptr argument is +used to return an offset in the original pattern. + +If buffptr is NULL, only the required length is computed and returned via +bufflenptr. If *buffptr is NULL, a buffer is allocated here and handed back in +*buffptr; pcre2_converted_pattern_free() releases it. Otherwise the caller's +buffer is used, with *bufflenptr giving its length on entry. + +Arguments: + pattern the input pattern + plength length of input, or PCRE2_ZERO_TERMINATED + options options bits + buffptr pointer to pointer to output buffer + bufflenptr pointer to length of output buffer + ccontext convert context or NULL + +Returns: 0 for success, else an error code (+ve or -ve) +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options, + PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr, + pcre2_convert_context *ccontext) +{ +int i, rc; +PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE]; +PCRE2_UCHAR *use_buffer = dummy_buffer; +PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE; +BOOL utf = (options & PCRE2_CONVERT_UTF) != 0; +uint32_t pattype = options & TYPE_OPTIONS; + +if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL; /* bufflenptr is not written on this path */ + +if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */ + (pattype & (~pattype+1)) != pattype || /* More than one type set */ + pattype == 0) /* No type set */ + { + *bufflenptr = 0; /* 
Error offset */ + return PCRE2_ERROR_BADOPTION; + } + +if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern); +if (ccontext == NULL) ccontext = + (pcre2_convert_context *)(&PRIV(default_convert_context)); + +/* Check UTF if required. */ + +#ifndef SUPPORT_UNICODE +if (utf) + { + *bufflenptr = 0; /* Error offset */ + return PCRE2_ERROR_UNICODE_NOT_SUPPORTED; + } +#else +if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0) + { + PCRE2_SIZE erroroffset; + rc = PRIV(valid_utf)(pattern, plength, &erroroffset); + if (rc != 0) + { + *bufflenptr = erroroffset; + return rc; + } + } +#endif + +/* If buffptr is not NULL, and what it points to is not NULL, we are being +provided with a buffer and a length, so set them as the buffer to use. */ + +if (buffptr != NULL && *buffptr != NULL) + { + use_buffer = *buffptr; + use_length = *bufflenptr; + } + +/* Call an individual converter, either just once (if a buffer was provided or +just the length is needed), or twice (if a memory allocation is required). */ + +for (i = 0; i < 2; i++) + { + PCRE2_UCHAR *allocated; + BOOL dummyrun = buffptr == NULL || *buffptr == NULL; /* No real output buffer yet */ + + switch(pattype) + { + case PCRE2_CONVERT_GLOB: + rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf, + use_buffer, use_length, bufflenptr, dummyrun, ccontext); + break; + + case PCRE2_CONVERT_POSIX_BASIC: + case PCRE2_CONVERT_POSIX_EXTENDED: + rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length, + bufflenptr, dummyrun, ccontext); + break; + + default: + *bufflenptr = 0; /* Error offset */ + return PCRE2_ERROR_INTERNAL; + } + + if (rc != 0 || /* Error */ + buffptr == NULL || /* Just the length is required */ + *buffptr != NULL) /* Buffer was provided or allocated */ + return rc; + + /* Allocate memory for the buffer, with hidden space for an allocator at + the start. The next time round the loop runs the conversion for real. 
*/ + + allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + + (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext); + if (allocated == NULL) return PCRE2_ERROR_NOMEMORY; + *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl)); /* Skip the hidden header */ + + use_buffer = *buffptr; + use_length = *bufflenptr + 1; /* Matches the extra code unit allocated above */ + } + +/* Control should never get here. */ + +return PCRE2_ERROR_INTERNAL; +} + + +/************************************************* +* Free converted pattern * +*************************************************/ + +/* This frees a converted pattern that was put in newly-allocated memory. + +Argument: the converted pattern; NULL is accepted and ignored +Returns: nothing +*/ + +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_converted_pattern_free(PCRE2_UCHAR *converted) +{ +if (converted != NULL) + { + /* The pcre2_memctl header sits immediately before the pattern. */ + pcre2_memctl *memctl = + (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl)); + memctl->free(memctl, memctl->memory_data); + } +} + +/* End of pcre2_convert.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_dfa_match.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_dfa_match.c new file mode 100644 index 0000000000..46e45ffa91 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_dfa_match.c @@ -0,0 +1,4066 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre2_dfa_match(), which is an +alternative matching function that uses a sort of DFA algorithm (not a true +FSM). 
This is NOT Perl-compatible, but it has advantages in certain +applications. */ + + +/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved +the performance of his patterns greatly. I could not use it as it stood, as it +was not thread safe, and made assumptions about pattern sizes. Also, it caused +test 7 to loop, and test 9 to crash with a segfault. + +The issue is the check for duplicate states, which is done by a simple linear +search up the state list. (Grep for "duplicate" below to find the code.) For +many patterns, there will never be many states active at one time, so a simple +linear search is fine. In patterns that have many active states, it might be a +bottleneck. The suggested code used an indexing scheme to remember which states +had previously been used for each character, and avoided the linear search when +it knew there was no chance of a duplicate. This was implemented when adding +states to the state lists. + +I wrote some thread-safe, not-limited code to try something similar at the time +of checking for duplicates (instead of when adding states), using index vectors +on the stack. It did give a 13% improvement with one specially constructed +pattern for certain subject strings, but on other strings and on many of the +simpler patterns in the test suite it did worse. The major problem, I think, +was the extra time to initialize the index. This had to be done for each call +of internal_dfa_match(). (The supplied patch used a static vector, initialized +only once - I suspect this was the cause of the problems with the tests.) + +Overall, I concluded that the gains in some cases did not outweigh the losses +in others, so I abandoned this code. 
*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#define NLBLOCK mb /* Block containing newline information */ +#define PSSTART start_subject /* Field containing processed string start */ +#define PSEND end_subject /* Field containing processed string end */ + +#include "regexp/pcre2/pcre2_internal.h" + +#define PUBLIC_DFA_MATCH_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ + PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ + PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \ + PCRE2_COPY_MATCHED_SUBJECT) + + +/************************************************* +* Code parameters and static tables * +*************************************************/ + +/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes +into others, under special conditions. A gap of 20 between the blocks should be +enough. The resulting opcodes don't have to be less than 256 because they are +never stored, so we push them well clear of the normal opcodes. */ + +#define OP_PROP_EXTRA 300 +#define OP_EXTUNI_EXTRA 320 +#define OP_ANYNL_EXTRA 340 +#define OP_HSPACE_EXTRA 360 +#define OP_VSPACE_EXTRA 380 + + +/* This table identifies those opcodes that are followed immediately by a +character that is to be tested in some way. This makes it possible to +centralize the loading of these characters. In the case of Type * etc, the +"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a +small value. Non-zero values in the table are the offsets from the opcode where +the character is to be found. ***NOTE*** If the start of this table is +modified, the three tables that follow must also be modified. 
*/ + +static const uint8_t coptable[] = { /* Indexed by opcode */ + 0, /* End */ + 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ + 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ + 0, 0, 0, /* Any, AllAny, Anybyte */ + 0, 0, /* \P, \p */ + 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ + 0, /* \X */ + 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ + 1, /* Char */ + 1, /* Chari */ + 1, /* not */ + 1, /* noti */ + /* Positive single-char repeats */ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ + 1+IMM2_SIZE, /* exact */ + 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ + 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ + 1+IMM2_SIZE, /* exact I */ + 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ + /* Negative single-char repeats - only for chars < 256 */ + 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ + 1+IMM2_SIZE, /* NOT exact */ + 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ + 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ + 1+IMM2_SIZE, /* NOT exact I */ + 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ + /* Positive type repeats */ + 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ + 1+IMM2_SIZE, /* Type exact */ + 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ + /* Character class & ref repeats */ + 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/ + 0, 0, /* CRRANGE, CRMINRANGE */ + 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ + 0, /* CLASS */ + 0, /* NCLASS */ + 0, /* XCLASS - variable length */ + 0, /* REF */ + 0, /* REFI */ + 0, /* DNREF */ + 0, /* DNREFI */ + 0, /* RECURSE */ + 0, /* CALLOUT */ + 0, /* CALLOUT_STR */ + 0, /* Alt */ + 0, /* Ket */ + 0, /* KetRmax */ + 0, /* KetRmin */ + 0, /* KetRpos */ + 0, /* Reverse */ + 0, /* Assert */ + 0, /* Assert not */ + 0, /* Assert behind */ + 0, /* Assert behind not */ + 0, /* NA assert */ + 0, /* NA assert behind */ + 0, /* ONCE */ + 0, /* SCRIPT_RUN */ + 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ + 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ + 0, 0, /* CREF, DNCREF */ + 0, 0, /* RREF, DNRREF */ + 0, 0, /* FALSE, TRUE */ + 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ + 0, 0, /* COMMIT, COMMIT_ARG */ + 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ + 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ +}; + +/* This table identifies those opcodes that inspect a character. It is used to +remember the fact that a character could have been inspected when the end of +the subject is reached. ***NOTE*** If the start of this table is modified, the +two tables that follow must also be modified. Like coptable, it is indexed by +opcode. */ + +static const uint8_t poptable[] = { + 0, /* End */ + 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ + 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ + 1, 1, 1, /* Any, AllAny, Anybyte */ + 1, 1, /* \P, \p */ + 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ + 1, /* \X */ + 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ + 1, /* Char */ + 1, /* Chari */ + 1, /* not */ + 1, /* noti */ + /* Positive single-char repeats */ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ + 1, 1, 1, /* upto, minupto, exact */ + 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ + 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ + 1, 1, 1, /* upto I, minupto I, exact I */ + 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ + /* Negative single-char repeats - only for chars < 256 */ + 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ + 1, 1, 1, /* NOT upto, minupto, exact */ + 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ + 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ + 1, 1, 1, /* NOT upto I, minupto I, exact I */ + 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ + /* Positive type repeats */ + 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ + 1, 1, 1, /* Type upto, minupto, exact */ + 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ + /* Character class & ref repeats */ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ + 1, 1, /* CRRANGE, CRMINRANGE */ + 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ + 1, /* CLASS */ + 1, /* NCLASS */ + 1, /* XCLASS - variable length */ + 0, /* REF */ + 0, /* REFI */ + 0, /* DNREF */ + 0, /* DNREFI */ + 0, /* RECURSE */ + 0, /* CALLOUT */ + 0, /* CALLOUT_STR */ + 0, /* Alt */ + 0, /* Ket */ + 0, /* KetRmax */ + 0, /* KetRmin */ + 0, /* KetRpos */ + 0, /* Reverse */ + 0, /* Assert */ + 0, /* Assert not */ + 0, /* Assert behind */ + 0, /* Assert behind not */ + 0, /* NA assert */ + 0, /* NA assert behind */ + 0, /* ONCE */ + 0, /* SCRIPT_RUN */ + 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ + 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ + 0, 0, /* CREF, DNCREF */ + 0, 0, /* RREF, DNRREF */ + 0, 0, /* FALSE, TRUE */ + 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ + 0, 0, /* COMMIT, COMMIT_ARG */ + 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ + 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ +}; + +/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, +and \w */ + +static const uint8_t toptable1[] = { + 0, 0, 0, 0, 0, 0, + 
ctype_digit, ctype_digit, + ctype_space, ctype_space, + ctype_word, ctype_word, + 0, 0 /* OP_ANY, OP_ALLANY */ +}; + +static const uint8_t toptable2[] = { + 0, 0, 0, 0, 0, 0, + ctype_digit, 0, + ctype_space, 0, + ctype_word, 0, + 1, 1 /* OP_ANY, OP_ALLANY */ +}; + + +/* Structure for holding data about a particular state, which is in effect the +current data for an active path through the match tree. It must consist +entirely of ints because the working vector we are passed, and which we put +these structures in, is a vector of ints. */ + +typedef struct stateblock { + int offset; /* Offset to opcode (-ve: state is held off, see data) */ + int count; /* Count for repeats */ + int data; /* Extra data for some states, e.g. characters still to skip */ +} stateblock; + +#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) /* stateblock size in ints */ + + +/* Before version 10.32 the recursive calls of internal_dfa_match() were passed +local working space and output vectors that were created on the stack. This has +caused issues for some patterns, especially in small-stack environments such as +Windows. A new scheme is now in use which sets up a vector on the stack, but if +this is too small, heap memory is used, up to the heap_limit. The main +parameters are all numbers of ints because the workspace is a vector of ints. + +The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is +defined in pcre2_internal.h so as to be available to pcre2test when it is +finding the minimum heap requirement for a match. */ + +#define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int)) /* ints per PCRE2_SIZE entry */ + +#define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */ +#define RWS_RSIZE 1000 /* Work size for recursion */ +#define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */ +#define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */ + +/* This structure is at the start of each workspace block. 
*/ + +typedef struct RWS_anchor { + struct RWS_anchor *next; /* Next block in the chain, or NULL */ + uint32_t size; /* Number of ints */ + uint32_t free; /* Number of ints */ +} RWS_anchor; + +#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int)) /* Anchor size in ints */ + + + +/************************************************* +* Process a callout * +*************************************************/ + +/* This function is called to perform a callout. + +Arguments: + code current code pointer + offsets points to current capture offsets + current_subject start of current subject match + ptr current position in subject + mb the match block + extracode extra code offset when called from condition + lengthptr where to return the callout length + +Returns: the return from the callout, or 0 if no callout function is set; + *lengthptr is filled in either way +*/ + +static int +do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject, + PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode, + PCRE2_SIZE *lengthptr) +{ +pcre2_callout_block *cb = mb->cb; + +*lengthptr = (code[extracode] == OP_CALLOUT)? + (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] : + (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode); + +if (mb->callout == NULL) return 0; /* No callout provided */ + +/* Fixed fields in the callout block are set once and for all at the start of +matching. 
*/ + +cb->offset_vector = offsets; +cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject); +cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject); +cb->pattern_position = GET(code, 1 + extracode); +cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode); + +if (code[extracode] == OP_CALLOUT) /* Numbered callout: no string fields */ + { + cb->callout_number = code[1 + 2*LINK_SIZE + extracode]; + cb->callout_string_offset = 0; + cb->callout_string = NULL; + cb->callout_string_length = 0; + } +else /* Otherwise the string fields are read from the code */ + { + cb->callout_number = 0; + cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode); + cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1; + cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2; + } + +return (mb->callout)(cb, mb->callout_data); +} + + + +/************************************************* +* Expand local workspace memory * +*************************************************/ + +/* This function is called when internal_dfa_match() is about to be called +recursively and there is insufficient working space left in the current +workspace block. If there's an existing next block, use it; otherwise get a new +block unless the heap limit is reached. + +Arguments: + rwsptr pointer to block pointer (updated) + ovecsize space needed for an ovector + mb the match block + +Returns: 0 rwsptr has been updated + !0 an error code (PCRE2_ERROR_HEAPLIMIT or PCRE2_ERROR_NOMEMORY) +*/ + +static int +more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb) +{ +RWS_anchor *rws = *rwsptr; +RWS_anchor *new; + +if (rws->next != NULL) + { + new = rws->next; /* Reuse the block already chained on */ + } + +/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but +mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid +overflow. */ + +else + { + uint32_t newsize = (rws->size >= UINT32_MAX/2)? 
UINT32_MAX/2 : rws->size * 2; /* Double the size, capped */ + uint32_t newsizeK = newsize/(1024/sizeof(int)); /* Same size in KiB */ + + if (newsizeK + mb->heap_used > mb->heap_limit) + newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used); + newsize = newsizeK*(1024/sizeof(int)); /* Back to ints, a whole number of KiB */ + + /* Give up if what is left cannot hold RWS_RSIZE ints, the ovector and an anchor. */ + if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE) + return PCRE2_ERROR_HEAPLIMIT; + new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data); + if (new == NULL) return PCRE2_ERROR_NOMEMORY; + mb->heap_used += newsizeK; + new->next = NULL; + new->size = newsize; + rws->next = new; + } + +new->free = new->size - RWS_ANCHOR_SIZE; /* All but the anchor is available */ +*rwsptr = new; +return 0; +} + + + +/************************************************* +* Match a Regular Expression - DFA engine * +*************************************************/ + +/* This internal function applies a compiled pattern to a subject string, +starting at a given point, using a DFA engine. This function is called from the +external one, possibly multiple times if the pattern is not anchored. The +function calls itself recursively for some kinds of subpattern. + +Arguments: + mb the match_data block with fixed information + this_start_code the opening bracket of this subexpression's code + current_subject where we currently are in the subject string + start_offset start offset in the subject string + offsets vector to contain the matching string offsets + offsetcount size of same + workspace vector of workspace + wscount size of same + rlevel function call recursion level + +Returns: > 0 => number of match offset pairs placed in offsets + = 0 => offsets overflowed; longest matches are present + -1 => failed to match + < -1 => some kind of unexpected problem + +The following macros are used for adding states to the two state vectors (one +for the current character, one for the following character). 
*/ + +/* Each macro makes the enclosing function return PCRE2_ERROR_DFA_WSSIZE when +its list already holds wscount states. ADD_ACTIVE appends a state to the +active list. */ + +#define ADD_ACTIVE(x,y) \ + if (active_count++ < wscount) \ + { \ + next_active_state->offset = (x); \ + next_active_state->count = (y); \ + next_active_state++; \ + } \ + else return PCRE2_ERROR_DFA_WSSIZE + +/* As ADD_ACTIVE, also setting the data field. */ + +#define ADD_ACTIVE_DATA(x,y,z) \ + if (active_count++ < wscount) \ + { \ + next_active_state->offset = (x); \ + next_active_state->count = (y); \ + next_active_state->data = (z); \ + next_active_state++; \ + } \ + else return PCRE2_ERROR_DFA_WSSIZE + +/* ADD_NEW appends a state to the new-state list instead. */ + +#define ADD_NEW(x,y) \ + if (new_count++ < wscount) \ + { \ + next_new_state->offset = (x); \ + next_new_state->count = (y); \ + next_new_state++; \ + } \ + else return PCRE2_ERROR_DFA_WSSIZE + +/* As ADD_NEW, also setting the data field. */ + +#define ADD_NEW_DATA(x,y,z) \ + if (new_count++ < wscount) \ + { \ + next_new_state->offset = (x); \ + next_new_state->count = (y); \ + next_new_state->data = (z); \ + next_new_state++; \ + } \ + else return PCRE2_ERROR_DFA_WSSIZE + +/* And now, here is the code */ + +static int +internal_dfa_match( + dfa_match_block *mb, + PCRE2_SPTR this_start_code, + PCRE2_SPTR current_subject, + PCRE2_SIZE start_offset, + PCRE2_SIZE *offsets, + uint32_t offsetcount, + int *workspace, + int wscount, + uint32_t rlevel, + int *RWS) +{ +stateblock *active_states, *new_states, *temp_states; +stateblock *next_active_state, *next_new_state; +const uint8_t *ctypes, *lcc, *fcc; +PCRE2_SPTR ptr; +PCRE2_SPTR end_code; +dfa_recursion_info new_recursive; +int active_count, new_count, match_count; + +/* Some fields in the mb block are frequently referenced, so we load them into +independent variables in the hope that this will perform better. 
*/ + +PCRE2_SPTR start_subject = mb->start_subject; +PCRE2_SPTR end_subject = mb->end_subject; +PCRE2_SPTR start_code = mb->start_code; + +#ifdef SUPPORT_UNICODE +BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0; +#else +BOOL utf = FALSE; +#endif + +BOOL reset_could_continue = FALSE; + +if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; +if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; +offsetcount &= (uint32_t)(-2); /* Round down */ + +wscount -= 2; +wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / + (2 * INTS_PER_STATEBLOCK); + +ctypes = mb->tables + ctypes_offset; +lcc = mb->tables + lcc_offset; +fcc = mb->tables + fcc_offset; + +match_count = PCRE2_ERROR_NOMATCH; /* A negative number */ + +active_states = (stateblock *)(workspace + 2); +next_new_state = new_states = active_states + wscount; +new_count = 0; + +/* The first thing in any (sub) pattern is a bracket of some sort. Push all +the alternative states onto the list, and find out where the end is. This +makes is possible to use this function recursively, when we want to stop at a +matching internal ket rather than at the end. + +If we are dealing with a backward assertion we have to find out the maximum +amount to move back, and set up each alternative appropriately. */ + +if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT) + { + size_t max_back = 0; + size_t gone_back; + + end_code = this_start_code; + do + { + size_t back = (size_t)GET(end_code, 2+LINK_SIZE); + if (back > max_back) max_back = back; + end_code += GET(end_code, 1); + } + while (*end_code == OP_ALT); + + /* If we can't go back the amount required for the longest lookbehind + pattern, go back as far as we can; some alternatives may still be viable. 
*/ + +#ifdef SUPPORT_UNICODE + /* In character mode we have to step back character by character */ + + if (utf) + { + for (gone_back = 0; gone_back < max_back; gone_back++) + { + if (current_subject <= start_subject) break; + current_subject--; + ACROSSCHAR(current_subject > start_subject, current_subject, + current_subject--); + } + } + else +#endif + + /* In byte-mode we can do this quickly. */ + + { + size_t current_offset = (size_t)(current_subject - start_subject); + gone_back = (current_offset < max_back)? current_offset : max_back; + current_subject -= gone_back; + } + + /* Save the earliest consulted character */ + + if (current_subject < mb->start_used_ptr) + mb->start_used_ptr = current_subject; + + /* Now we can process the individual branches. There will be an OP_REVERSE at + the start of each branch, except when the length of the branch is zero. */ + + end_code = this_start_code; + do + { + uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0; + size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE); + if (back <= gone_back) + { + int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen); + ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back)); + } + end_code += GET(end_code, 1); + } + while (*end_code == OP_ALT); + } + +/* This is the code for a "normal" subpattern (not a backward assertion). The +start of a whole pattern is always one of these. If we are at the top level, +we may be asked to restart matching from the same point that we reached for a +previous partial match. We still have to scan through the top-level branches to +find the end state. 
*/ + +else + { + end_code = this_start_code; + + /* Restarting */ + + if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0) + { + do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); + new_count = workspace[1]; + if (!workspace[0]) + memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock)); + } + + /* Not restarting */ + + else + { + int length = 1 + LINK_SIZE + + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || + *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) + ? IMM2_SIZE:0); + do + { + ADD_NEW((int)(end_code - start_code + length), 0); + end_code += GET(end_code, 1); + length = 1 + LINK_SIZE; + } + while (*end_code == OP_ALT); + } + } + +workspace[0] = 0; /* Bit indicating which vector is current */ + +/* Loop for scanning the subject */ + +ptr = current_subject; +for (;;) + { + int i, j; + int clen, dlen; + uint32_t c, d; + int forced_fail = 0; + BOOL partial_newline = FALSE; + BOOL could_continue = reset_could_continue; + reset_could_continue = FALSE; + + if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr; + + /* Make the new state list into the active state list and empty the + new state list. */ + + temp_states = active_states; + active_states = new_states; + new_states = temp_states; + active_count = new_count; + new_count = 0; + + workspace[0] ^= 1; /* Remember for the restarting feature */ + workspace[1] = active_count; + + /* Set the pointers for adding new states */ + + next_active_state = active_states + active_count; + next_new_state = new_states; + + /* Load the current character from the subject outside the loop, as many + different states may want to look at it, and we assume that at least one + will. 
*/ + + if (ptr < end_subject) + { + clen = 1; /* Number of data items in the character */ +#ifdef SUPPORT_UNICODE + GETCHARLENTEST(c, ptr, clen); +#else + c = *ptr; +#endif /* SUPPORT_UNICODE */ + } + else + { + clen = 0; /* This indicates the end of the subject */ + c = NOTACHAR; /* This value should never actually be used */ + } + + /* Scan up the active states and act on each one. The result of an action + may be to add more states to the currently active list (e.g. on hitting a + parenthesis) or it may be to put states on the new list, for considering + when we move the character pointer on. */ + + for (i = 0; i < active_count; i++) + { + stateblock *current_state = active_states + i; + BOOL caseless = FALSE; + PCRE2_SPTR code; + uint32_t codevalue; + int state_offset = current_state->offset; + int rrc; + int count; + + /* A negative offset is a special case meaning "hold off going to this + (negated) state until the number of characters in the data field have + been skipped". If the could_continue flag was passed over from a previous + state, arrange for it to passed on. */ + + if (state_offset < 0) + { + if (current_state->data > 0) + { + ADD_NEW_DATA(state_offset, current_state->count, + current_state->data - 1); + if (could_continue) reset_could_continue = TRUE; + continue; + } + else + { + current_state->offset = state_offset = -state_offset; + } + } + + /* Check for a duplicate state with the same count, and skip if found. + See the note at the head of this module about the possibility of improving + performance here. */ + + for (j = 0; j < i; j++) + { + if (active_states[j].offset == state_offset && + active_states[j].count == current_state->count) + goto NEXT_ACTIVE_STATE; + } + + /* The state offset is the offset to the opcode */ + + code = start_code + state_offset; + codevalue = *code; + + /* If this opcode inspects a character, but we are at the end of the + subject, remember the fact for use when testing for a partial match. 
*/ + + if (clen == 0 && poptable[codevalue] != 0) + could_continue = TRUE; + + /* If this opcode is followed by an inline character, load it. It is + tempting to test for the presence of a subject character here, but that + is wrong, because sometimes zero repetitions of the subject are + permitted. + + We also use this mechanism for opcodes such as OP_TYPEPLUS that take an + argument that is not a data character - but is always one byte long because + the values are small. We have to take special action to deal with \P, \p, + \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert + these ones to new opcodes. */ + + if (coptable[codevalue] > 0) + { + dlen = 1; +#ifdef SUPPORT_UNICODE + if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else +#endif /* SUPPORT_UNICODE */ + d = code[coptable[codevalue]]; + if (codevalue >= OP_TYPESTAR) + { + switch(d) + { + case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM; + case OP_NOTPROP: + case OP_PROP: codevalue += OP_PROP_EXTRA; break; + case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; + case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; + case OP_NOT_HSPACE: + case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; + case OP_NOT_VSPACE: + case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; + default: break; + } + } + } + else + { + dlen = 0; /* Not strictly necessary, but compilers moan */ + d = NOTACHAR; /* if these variables are not set. */ + } + + + /* Now process the individual opcodes */ + + switch (codevalue) + { +/* ========================================================================== */ + /* These cases are never obeyed. This is a fudge that causes a compile- + time error if the vectors coptable or poptable, which are indexed by + opcode, are not the correct length. It seems to be the only way to do + such a check at compile time, as the sizeof() operator does not work + in the C preprocessor. 
*/ + + case OP_TABLE_LENGTH: + case OP_TABLE_LENGTH + + ((sizeof(coptable) == OP_TABLE_LENGTH) && + (sizeof(poptable) == OP_TABLE_LENGTH)): + return 0; + +/* ========================================================================== */ + /* Reached a closing bracket. If not at the end of the pattern, carry + on with the next opcode. For repeating opcodes, also add the repeat + state. Note that KETRPOS will always be encountered at the end of the + subpattern, because the possessive subpattern repeats are always handled + using recursive calls. Thus, it never adds any new states. + + At the end of the (sub)pattern, unless we have an empty string and + PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the + start of the subject, save the match data, shifting up all previous + matches so we always have the longest first. */ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + case OP_KETRPOS: + if (code != end_code) + { + ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); + if (codevalue != OP_KET) + { + ADD_ACTIVE(state_offset - (int)GET(code, 1), 0); + } + } + else + { + if (ptr > current_subject || + ((mb->moptions & PCRE2_NOTEMPTY) == 0 && + ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 || + current_subject > start_subject + mb->start_offset))) + { + if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; + else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) + match_count = 0; + count = ((match_count == 0)? 
(int)offsetcount : match_count * 2) - 2; + if (count > 0) (void)memmove(offsets + 2, offsets, + (size_t)count * sizeof(PCRE2_SIZE)); + if (offsetcount >= 2) + { + offsets[0] = (PCRE2_SIZE)(current_subject - start_subject); + offsets[1] = (PCRE2_SIZE)(ptr - start_subject); + } + if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count; + } + } + break; + +/* ========================================================================== */ + /* These opcodes add to the current list of states without looking + at the current character. */ + + /*-----------------------------------------------------------------*/ + case OP_ALT: + do { code += GET(code, 1); } while (*code == OP_ALT); + ADD_ACTIVE((int)(code - start_code), 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_BRA: + case OP_SBRA: + do + { + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); + code += GET(code, 1); + } + while (*code == OP_ALT); + break; + + /*-----------------------------------------------------------------*/ + case OP_CBRA: + case OP_SCBRA: + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); + code += GET(code, 1); + while (*code == OP_ALT) + { + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); + code += GET(code, 1); + } + break; + + /*-----------------------------------------------------------------*/ + case OP_BRAZERO: + case OP_BRAMINZERO: + ADD_ACTIVE(state_offset + 1, 0); + code += 1 + GET(code, 2); + while (*code == OP_ALT) code += GET(code, 1); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_SKIPZERO: + code += 1 + GET(code, 2); + while (*code == OP_ALT) code += GET(code, 1); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); + break; + + /*-----------------------------------------------------------------*/ + case OP_CIRC: + if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) + { 
ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_CIRCM: + if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) || + ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 ) + && WAS_NEWLINE(ptr))) + { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_EOD: + if (ptr >= end_subject) + { + if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) + return PCRE2_ERROR_PARTIAL; + else { ADD_ACTIVE(state_offset + 1, 0); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_SOD: + if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_SOM: + if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } + break; + + +/* ========================================================================== */ + /* These opcodes inspect the next subject character, and sometimes + the previous one as well, but do not have an argument. The variable + clen contains the length of the current character and is zero if we are + at the end of the subject. 
*/ + + /*-----------------------------------------------------------------*/ + case OP_ANY: + if (clen > 0 && !IS_NEWLINE(ptr)) + { + if (ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else + { + ADD_NEW(state_offset + 1, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ALLANY: + if (clen > 0) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_EODN: + if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen)) + { + if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) + return PCRE2_ERROR_PARTIAL; + ADD_ACTIVE(state_offset + 1, 0); + } + break; + + /*-----------------------------------------------------------------*/ + case OP_DOLL: + if ((mb->moptions & PCRE2_NOTEOL) == 0) + { + if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) + could_continue = TRUE; + else if (clen == 0 || + ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && + (ptr == end_subject - mb->nllen) + )) + { ADD_ACTIVE(state_offset + 1, 0); } + else if (ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) + { + reset_could_continue = TRUE; + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else could_continue = partial_newline = TRUE; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_DOLLM: + if ((mb->moptions & PCRE2_NOTEOL) == 0) + { + if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) + could_continue = TRUE; + else if (clen == 0 || + ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) + { ADD_ACTIVE(state_offset + 1, 0); } + 
else if (ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) + { + reset_could_continue = TRUE; + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else could_continue = partial_newline = TRUE; + } + } + else if (IS_NEWLINE(ptr)) + { ADD_ACTIVE(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + + case OP_DIGIT: + case OP_WHITESPACE: + case OP_WORDCHAR: + if (clen > 0 && c < 256 && + ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_NOT_DIGIT: + case OP_NOT_WHITESPACE: + case OP_NOT_WORDCHAR: + if (clen > 0 && (c >= 256 || + ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + { + int left_word, right_word; + + if (ptr > start_subject) + { + PCRE2_SPTR temp = ptr - 1; + if (temp < mb->start_used_ptr) mb->start_used_ptr = temp; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (utf) { BACKCHAR(temp); } +#endif + GETCHARTEST(d, temp); +#ifdef SUPPORT_UNICODE + if ((mb->poptions & PCRE2_UCP) != 0) + { + if (d == '_') left_word = TRUE; else + { + uint32_t cat = UCD_CATEGORY(d); + left_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif + left_word = d < 256 && (ctypes[d] & ctype_word) != 0; + } + else left_word = FALSE; + + if (clen > 0) + { + if (ptr >= mb->last_used_ptr) + { + PCRE2_SPTR temp = ptr + 1; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (utf) { FORWARDCHARTEST(temp, mb->end_subject); } +#endif + mb->last_used_ptr = temp; + } +#ifdef SUPPORT_UNICODE + if 
((mb->poptions & PCRE2_UCP) != 0) + { + if (c == '_') right_word = TRUE; else + { + uint32_t cat = UCD_CATEGORY(c); + right_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif + right_word = c < 256 && (ctypes[c] & ctype_word) != 0; + } + else right_word = FALSE; + + if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) + { ADD_ACTIVE(state_offset + 1, 0); } + } + break; + + + /*-----------------------------------------------------------------*/ + /* Check the next character by Unicode property. We will get here only + if the support is in the binary; otherwise a compile-time error occurs. + */ + +#ifdef SUPPORT_UNICODE + case OP_PROP: + case OP_NOTPROP: + if (clen > 0) + { + BOOL OK; + const uint32_t *cp; + const ucd_record * prop = GET_UCD(c); + switch(code[1]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; + break; + + case PT_GC: + OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; + break; + + case PT_PC: + OK = prop->chartype == code[2]; + break; + + case PT_SC: + OK = prop->script == code[2]; + break; + + case PT_SCX: + OK = (prop->script == code[2] || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0); + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } + break; + + case PT_WORD: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + + case PT_BIDICL: + OK = UCD_BIDICLASS(c) == code[2]; + break; + + case PT_BOOL: + OK = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), code[2]) != 0; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } + } + break; +#endif + + + +/* ========================================================================== */ + /* These opcodes likewise inspect the subject character, but have an + argument that is not a data character. It is one of these opcodes: + OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, + OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/ + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + if (d == OP_ANY && ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || !IS_NEWLINE(ptr)) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (count > 0 && codevalue == OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + if (d == OP_ANY && ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || !IS_NEWLINE(ptr)) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + 2, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + if (d == OP_ANY && ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { 
+ could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || !IS_NEWLINE(ptr)) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEEXACT: + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + if (d == OP_ANY && ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || !IS_NEWLINE(ptr)) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + if (d == OP_ANY && ptr + 1 >= mb->end_subject && + (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + c == NLBLOCK->nl[0]) + { + could_continue = partial_newline = TRUE; + } + else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || + (c < 256 && + (d != OP_ANY || !IS_NEWLINE(ptr)) && + ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) + { + if (codevalue == OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= 
(int)GET2(code, 1)) + { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + +/* ========================================================================== */ + /* These are virtual opcodes that are used when something like + OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its + argument. It keeps the code above fast for the other cases. The argument + is in the d variable. */ + +#ifdef SUPPORT_UNICODE + case OP_PROP_EXTRA + OP_TYPEPLUS: + case OP_PROP_EXTRA + OP_TYPEMINPLUS: + case OP_PROP_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } + if (clen > 0) + { + BOOL OK; + const uint32_t *cp; + const ucd_record * prop = GET_UCD(c); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; + break; + + case PT_GC: + OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; + break; + + case PT_PC: + OK = prop->chartype == code[3]; + break; + + case PT_SC: + OK = prop->script == code[3]; + break; + + case PT_SCX: + OK = (prop->script == code[3] || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0); + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } + break; + + case PT_WORD: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + + case PT_BIDICL: + OK = UCD_BIDICLASS(c) == code[3]; + break; + + case PT_BOOL: + OK = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), code[3]) != 0; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEPLUS: + case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: + case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + int ncount = 0; + if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); + count++; + ADD_NEW_DATA(-state_offset, count, ncount); + } + break; +#endif + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + 
OP_TYPEPLUS: + case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: + case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; + goto ANYNL01; + + case CHAR_CR: + if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; + /* Fall through */ + + ANYNL01: + case CHAR_LF: + if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW_DATA(-state_offset, count, ncount); + break; + + default: + break; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_VSPACE_EXTRA + OP_TYPEPLUS: + case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: + case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + BOOL OK; + switch (c) + { + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + break; + } + + if (OK == (d == OP_VSPACE)) + { + if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW_DATA(-state_offset, count, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_HSPACE_EXTRA + OP_TYPEPLUS: + case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: + case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } + if (clen > 0) + { + BOOL OK; + switch (c) + { + HSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + break; + } + + if (OK == (d == OP_HSPACE)) + { + if 
(count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW_DATA(-state_offset, count, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ +#ifdef SUPPORT_UNICODE + case OP_PROP_EXTRA + OP_TYPEQUERY: + case OP_PROP_EXTRA + OP_TYPEMINQUERY: + case OP_PROP_EXTRA + OP_TYPEPOSQUERY: + count = 4; + goto QS1; + + case OP_PROP_EXTRA + OP_TYPESTAR: + case OP_PROP_EXTRA + OP_TYPEMINSTAR: + case OP_PROP_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS1: + + ADD_ACTIVE(state_offset + 4, 0); + if (clen > 0) + { + BOOL OK; + const uint32_t *cp; + const ucd_record * prop = GET_UCD(c); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; + break; + + case PT_GC: + OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; + break; + + case PT_PC: + OK = prop->chartype == code[3]; + break; + + case PT_SC: + OK = prop->script == code[3]; + break; + + case PT_SCX: + OK = (prop->script == code[3] || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0); + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } + break; + + case PT_WORD: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[3]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + + case PT_BIDICL: + OK = UCD_BIDICLASS(c) == code[3]; + break; + + case PT_BOOL: + OK = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), code[3]) != 0; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + count, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEQUERY: + case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: + case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS2; + + case OP_EXTUNI_EXTRA + OP_TYPESTAR: + case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: + case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS2: + + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + int ncount = 0; + if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); + 
ADD_NEW_DATA(-(state_offset + count), 0, ncount); + } + break; +#endif + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + OP_TYPEQUERY: + case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: + case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS3; + + case OP_ANYNL_EXTRA + OP_TYPESTAR: + case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: + case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS3: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; + goto ANYNL02; + + case CHAR_CR: + if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; + /* Fall through */ + + ANYNL02: + case CHAR_LF: + if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); + break; + + default: + break; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_VSPACE_EXTRA + OP_TYPEQUERY: + case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: + case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS4; + + case OP_VSPACE_EXTRA + OP_TYPESTAR: + case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: + case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS4: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + BOOL OK; + switch (c) + { + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + break; + } + if (OK == (d == OP_VSPACE)) + { + if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); + } + } + break; + + 
/*-----------------------------------------------------------------*/ + case OP_HSPACE_EXTRA + OP_TYPEQUERY: + case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: + case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: + count = 2; + goto QS5; + + case OP_HSPACE_EXTRA + OP_TYPESTAR: + case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: + case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: + count = 0; + + QS5: + ADD_ACTIVE(state_offset + 2, 0); + if (clen > 0) + { + BOOL OK; + switch (c) + { + HSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + break; + } + + if (OK == (d == OP_HSPACE)) + { + if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || + codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ +#ifdef SUPPORT_UNICODE + case OP_PROP_EXTRA + OP_TYPEEXACT: + case OP_PROP_EXTRA + OP_TYPEUPTO: + case OP_PROP_EXTRA + OP_TYPEMINUPTO: + case OP_PROP_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + BOOL OK; + const uint32_t *cp; + const ucd_record * prop = GET_UCD(c); + switch(code[1 + IMM2_SIZE + 1]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt; + break; + + case PT_GC: + OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; + break; + + case PT_PC: + OK = prop->chartype == code[1 + IMM2_SIZE + 2]; + break; + + case PT_SC: + OK = prop->script == code[1 + IMM2_SIZE + 2]; + break; + + case PT_SCX: + OK = (prop->script == code[1 + IMM2_SIZE + 2] || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), + code[1 + IMM2_SIZE + 2]) != 0); + break; + + /* These are specials for combination cases. 
*/ + + case PT_ALNUM: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N; + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; + break; + } + break; + + case PT_WORD: + OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; + for (;;) + { + if (c < *cp) { OK = FALSE; break; } + if (c == *cp++) { OK = TRUE; break; } + } + break; + + case PT_UCNC: + OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000; + break; + + case PT_BIDICL: + OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2]; + break; + + case PT_BOOL: + OK = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0; + break; + + /* Should never occur, but keep compilers from grumbling. 
*/ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) + { + if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXTUNI_EXTRA + OP_TYPEEXACT: + case OP_EXTUNI_EXTRA + OP_TYPEUPTO: + case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: + case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + PCRE2_SPTR nptr; + int ncount = 0; + if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, + &ncount); + if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } + else + { ADD_NEW_DATA(-state_offset, count, ncount); } + } + break; +#endif + + /*-----------------------------------------------------------------*/ + case OP_ANYNL_EXTRA + OP_TYPEEXACT: + case OP_ANYNL_EXTRA + OP_TYPEUPTO: + case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: + case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + int ncount = 0; + switch (c) + { + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; + goto ANYNL03; + + case CHAR_CR: + if (ptr + 
1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; + /* Fall through */ + + ANYNL03: + case CHAR_LF: + if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } + else + { ADD_NEW_DATA(-state_offset, count, ncount); } + break; + + default: + break; + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_VSPACE_EXTRA + OP_TYPEEXACT: + case OP_VSPACE_EXTRA + OP_TYPEUPTO: + case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: + case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + BOOL OK; + switch (c) + { + VSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + } + + if (OK == (d == OP_VSPACE)) + { + if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } + else + { ADD_NEW_DATA(-state_offset, count, 0); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_HSPACE_EXTRA + OP_TYPEEXACT: + case OP_HSPACE_EXTRA + OP_TYPEUPTO: + case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: + case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: + if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) + { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + BOOL OK; + switch (c) + { + HSPACE_CASES: + OK = TRUE; + break; + + default: + OK = FALSE; + break; + } + + if (OK == (d == OP_HSPACE)) + { + if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + 
} + if (++count >= (int)GET2(code, 1)) + { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } + else + { ADD_NEW_DATA(-state_offset, count, 0); } + } + } + break; + +/* ========================================================================== */ + /* These opcodes are followed by a character that is usually compared + to the current subject character; it is loaded into d. We still get + here even if there is no subject character, because in some cases zero + repetitions are permitted. */ + + /*-----------------------------------------------------------------*/ + case OP_CHAR: + if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + case OP_CHARI: + if (clen == 0) break; + +#ifdef SUPPORT_UNICODE + if (utf_or_ucp) + { + if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else + { + unsigned int othercase; + if (c < 128) + othercase = fcc[c]; + else + othercase = UCD_OTHERCASE(c); + if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } + } + } + else +#endif /* SUPPORT_UNICODE */ + /* Not UTF or UCP mode */ + { + if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) + { ADD_NEW(state_offset + 2, 0); } + } + break; + + +#ifdef SUPPORT_UNICODE + /*-----------------------------------------------------------------*/ + /* This is a tricky one because it can match more than one character. + Find out how many characters to skip, and then set up a negative state + to wait for them to pass before continuing. 
*/ + + case OP_EXTUNI: + if (clen > 0) + { + int ncount = 0; + PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, + end_subject, utf, &ncount); + if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; + ADD_NEW_DATA(-(state_offset + 1), 0, ncount); + } + break; +#endif + + /*-----------------------------------------------------------------*/ + /* This is a tricky like EXTUNI because it too can match more than one + character (when CR is followed by LF). In this case, set up a negative + state to wait for one character to pass before continuing. */ + + case OP_ANYNL: + if (clen > 0) switch(c) + { + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; + /* Fall through */ + + case CHAR_LF: + ADD_NEW(state_offset + 1, 0); + break; + + case CHAR_CR: + if (ptr + 1 >= end_subject) + { + ADD_NEW(state_offset + 1, 0); + if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) + reset_could_continue = TRUE; + } + else if (UCHAR21TEST(ptr + 1) == CHAR_LF) + { + ADD_NEW_DATA(-(state_offset + 1), 0, 1); + } + else + { + ADD_NEW(state_offset + 1, 0); + } + break; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_NOT_VSPACE: + if (clen > 0) switch(c) + { + VSPACE_CASES: + break; + + default: + ADD_NEW(state_offset + 1, 0); + break; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_VSPACE: + if (clen > 0) switch(c) + { + VSPACE_CASES: + ADD_NEW(state_offset + 1, 0); + break; + + default: + break; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_NOT_HSPACE: + if (clen > 0) switch(c) + { + HSPACE_CASES: + break; + + default: + ADD_NEW(state_offset + 1, 0); + break; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_HSPACE: + if (clen 
> 0) switch(c) + { + HSPACE_CASES: + ADD_NEW(state_offset + 1, 0); + break; + + default: + break; + } + break; + + /*-----------------------------------------------------------------*/ + /* Match a negated single character casefully. */ + + case OP_NOT: + if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ + /* Match a negated single character caselessly. */ + + case OP_NOTI: + if (clen > 0) + { + uint32_t otherd; +#ifdef SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + if (c != d && c != otherd) + { ADD_NEW(state_offset + dlen + 1, 0); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_PLUSI: + case OP_MINPLUSI: + case OP_POSPLUSI: + case OP_NOTPLUSI: + case OP_NOTMINPLUSI: + case OP_NOTPOSPLUSI: + caseless = TRUE; + codevalue -= OP_STARI - OP_STAR; + + /* Fall through */ + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } + if (clen > 0) + { + uint32_t otherd = NOTACHAR; + if (caseless) + { +#ifdef SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (count > 0 && + (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_QUERYI: + case OP_MINQUERYI: + case OP_POSQUERYI: + case OP_NOTQUERYI: + case OP_NOTMINQUERYI: + case OP_NOTPOSQUERYI: + caseless = TRUE; + 
codevalue -= OP_STARI - OP_STAR; + /* Fall through */ + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + case OP_NOTPOSQUERY: + ADD_ACTIVE(state_offset + dlen + 1, 0); + if (clen > 0) + { + uint32_t otherd = NOTACHAR; + if (caseless) + { +#ifdef SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset + dlen + 1, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_STARI: + case OP_MINSTARI: + case OP_POSSTARI: + case OP_NOTSTARI: + case OP_NOTMINSTARI: + case OP_NOTPOSSTARI: + caseless = TRUE; + codevalue -= OP_STARI - OP_STAR; + /* Fall through */ + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPOSSTAR: + ADD_ACTIVE(state_offset + dlen + 1, 0); + if (clen > 0) + { + uint32_t otherd = NOTACHAR; + if (caseless) + { +#ifdef SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_EXACTI: + case OP_NOTEXACTI: + caseless = TRUE; + codevalue -= OP_STARI - OP_STAR; + /* Fall through */ + case OP_EXACT: + case OP_NOTEXACT: + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + uint32_t otherd = NOTACHAR; + if (caseless) + { +#ifdef 
SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_UPTOI: + case OP_MINUPTOI: + case OP_POSUPTOI: + case OP_NOTUPTOI: + case OP_NOTMINUPTOI: + case OP_NOTPOSUPTOI: + caseless = TRUE; + codevalue -= OP_STARI - OP_STAR; + /* Fall through */ + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_NOTUPTO: + case OP_NOTMINUPTO: + case OP_NOTPOSUPTO: + ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); + count = current_state->count; /* Number already matched */ + if (clen > 0) + { + uint32_t otherd = NOTACHAR; + if (caseless) + { +#ifdef SUPPORT_UNICODE + if (utf_or_ucp && d >= 128) + otherd = UCD_OTHERCASE(d); + else +#endif /* SUPPORT_UNICODE */ + otherd = TABLE_GET(d, fcc, d); + } + if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) + { + if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + if (++count >= (int)GET2(code, 1)) + { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } + else + { ADD_NEW(state_offset, count); } + } + } + break; + + +/* ========================================================================== */ + /* These are the class-handling opcodes */ + + case OP_CLASS: + case OP_NCLASS: + case OP_XCLASS: + { + BOOL isinclass = FALSE; + int next_state_offset; + PCRE2_SPTR ecode; + + /* For a simple class, there is always just a 32-byte table, and we + can set isinclass from it. */ + + if (codevalue != OP_XCLASS) + { + ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); + if (clen > 0) + { + isinclass = (c > 255)? 
(codevalue == OP_NCLASS) : + ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0); + } + } + + /* An extended class may have a table or a list of single characters, + ranges, or both, and it may be positive or negative. There's a + function that sorts all this out. */ + + else + { + ecode = code + GET(code, 1); + if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); + } + + /* At this point, isinclass is set for all kinds of class, and ecode + points to the byte after the end of the class. If there is a + quantifier, this is where it will be. */ + + next_state_offset = (int)(ecode - start_code); + + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPOSSTAR: + ADD_ACTIVE(next_state_offset + 1, 0); + if (isinclass) + { + if (*ecode == OP_CRPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } + break; + + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + count = current_state->count; /* Already matched */ + if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } + if (isinclass) + { + if (count > 0 && *ecode == OP_CRPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } + break; + + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSQUERY: + ADD_ACTIVE(next_state_offset + 1, 0); + if (isinclass) + { + if (*ecode == OP_CRPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(next_state_offset + 1, 0); + } + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + count = current_state->count; /* Already matched */ + if (count >= (int)GET2(ecode, 1)) + { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } + if (isinclass) + { + int max = (int)GET2(ecode, 1 + IMM2_SIZE); + + if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1)) + { + active_count--; /* Remove non-match possibility */ + 
next_active_state--; + } + + if (++count >= max && max != 0) /* Max 0 => no limit */ + { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } + else + { ADD_NEW(state_offset, count); } + } + break; + + default: + if (isinclass) { ADD_NEW(next_state_offset, 0); } + break; + } + } + break; + +/* ========================================================================== */ + /* These are the opcodes for fancy brackets of various kinds. We have + to use recursion in order to handle them. The "always failing" assertion + (?!) is optimised to OP_FAIL when compiling, so we have to support that, + though the other "backtracking verbs" are not supported. */ + + case OP_FAIL: + forced_fail++; /* Count FAILs for multiple states */ + break; + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + { + int rc; + int *local_workspace; + PCRE2_SIZE *local_offsets; + PCRE2_SPTR endasscode = code + GET(code, 1); + RWS_anchor *rws = (RWS_anchor *)RWS; + + if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) + { + rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); + if (rc != 0) return rc; + RWS = (int *)rws; + } + + local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); + local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; + rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; + + while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); + + rc = internal_dfa_match( + mb, /* static match data */ + code, /* this subexpression's code */ + ptr, /* where we currently are */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ + local_offsets, /* offset vector */ + RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ + local_workspace, /* workspace vector */ + RWS_RSIZE, /* size of same */ + rlevel, /* function recursion level */ + RWS); /* recursion workspace */ + + rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; + + if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; + if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) + { 
ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_COND: + case OP_SCOND: + { + int codelink = (int)GET(code, 1); + PCRE2_UCHAR condcode; + + /* Because of the way auto-callout works during compile, a callout item + is inserted between OP_COND and an assertion condition. This does not + happen for the other conditions. */ + + if (code[LINK_SIZE + 1] == OP_CALLOUT + || code[LINK_SIZE + 1] == OP_CALLOUT_STR) + { + PCRE2_SIZE callout_length; + rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, + 1 + LINK_SIZE, &callout_length); + if (rrc < 0) return rrc; /* Abandon */ + if (rrc > 0) break; /* Fail this thread */ + code += callout_length; /* Skip callout data */ + } + + condcode = code[LINK_SIZE+1]; + + /* Back reference conditions and duplicate named recursion conditions + are not supported */ + + if (condcode == OP_CREF || condcode == OP_DNCREF || + condcode == OP_DNRREF) + return PCRE2_ERROR_DFA_UCOND; + + /* The DEFINE condition is always false, and the assertion (?!) is + converted to OP_FAIL. */ + + if (condcode == OP_FALSE || condcode == OP_FAIL) + { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } + + /* There is also an always-true condition */ + + else if (condcode == OP_TRUE) + { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); } + + /* The only supported version of OP_RREF is for the value RREF_ANY, + which means "test if in any recursion". We can't test for specifically + recursed groups. 
*/ + + else if (condcode == OP_RREF) + { + unsigned int value = GET2(code, LINK_SIZE + 2); + if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND; + if (mb->recursive != NULL) + { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } + else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } + } + + /* Otherwise, the condition is an assertion */ + + else + { + int rc; + int *local_workspace; + PCRE2_SIZE *local_offsets; + PCRE2_SPTR asscode = code + LINK_SIZE + 1; + PCRE2_SPTR endasscode = asscode + GET(asscode, 1); + RWS_anchor *rws = (RWS_anchor *)RWS; + + if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) + { + rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); + if (rc != 0) return rc; + RWS = (int *)rws; + } + + local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); + local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; + rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; + + while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); + + rc = internal_dfa_match( + mb, /* fixed match data */ + asscode, /* this subexpression's code */ + ptr, /* where we currently are */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ + local_offsets, /* offset vector */ + RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ + local_workspace, /* workspace vector */ + RWS_RSIZE, /* size of same */ + rlevel, /* function recursion level */ + RWS); /* recursion workspace */ + + rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; + + if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; + if ((rc >= 0) == + (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) + { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } + else + { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_RECURSE: + { + int rc; + int *local_workspace; + PCRE2_SIZE *local_offsets; + RWS_anchor *rws = (RWS_anchor *)RWS; + dfa_recursion_info *ri; + PCRE2_SPTR callpat = start_code + GET(code, 1); + 
uint32_t recno = (callpat == mb->start_code)? 0 : + GET2(callpat, 1 + LINK_SIZE); + + if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE) + { + rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb); + if (rc != 0) return rc; + RWS = (int *)rws; + } + + local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); + local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE; + rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE; + + /* Check for repeating a recursion without advancing the subject + pointer. This should catch convoluted mutual recursions. (Some simple + cases are caught at compile time.) */ + + for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) + if (recno == ri->group_num && ptr == ri->subject_position) + return PCRE2_ERROR_RECURSELOOP; + + /* Remember this recursion and where we started it so as to + catch infinite loops. */ + + new_recursive.group_num = recno; + new_recursive.subject_position = ptr; + new_recursive.prevrec = mb->recursive; + mb->recursive = &new_recursive; + + rc = internal_dfa_match( + mb, /* fixed match data */ + callpat, /* this subexpression's code */ + ptr, /* where we currently are */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ + local_offsets, /* offset vector */ + RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */ + local_workspace, /* workspace vector */ + RWS_RSIZE, /* size of same */ + rlevel, /* function recursion level */ + RWS); /* recursion workspace */ + + rws->free += RWS_RSIZE + RWS_OVEC_RSIZE; + mb->recursive = new_recursive.prevrec; /* Done this recursion */ + + /* Ran out of internal offsets */ + + if (rc == 0) return PCRE2_ERROR_DFA_RECURSE; + + /* For each successful matched substring, set up the next state with a + count of characters to skip before trying it. Note that the count is in + characters, not bytes. 
*/ + + if (rc > 0) + { + for (rc = rc*2 - 2; rc >= 0; rc -= 2) + { + PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc]; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (utf) + { + PCRE2_SPTR p = start_subject + local_offsets[rc]; + PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; + while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; + } +#endif + if (charcount > 0) + { + ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, + (int)(charcount - 1)); + } + else + { + ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); + } + } + } + else if (rc != PCRE2_ERROR_NOMATCH) return rc; + } + break; + + /*-----------------------------------------------------------------*/ + case OP_BRAPOS: + case OP_SBRAPOS: + case OP_CBRAPOS: + case OP_SCBRAPOS: + case OP_BRAPOSZERO: + { + int rc; + int *local_workspace; + PCRE2_SIZE *local_offsets; + PCRE2_SIZE charcount, matched_count; + PCRE2_SPTR local_ptr = ptr; + RWS_anchor *rws = (RWS_anchor *)RWS; + BOOL allow_zero; + + if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) + { + rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); + if (rc != 0) return rc; + RWS = (int *)rws; + } + + local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); + local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; + rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; + + if (codevalue == OP_BRAPOSZERO) + { + allow_zero = TRUE; + codevalue = *(++code); /* Codevalue will be one of above BRAs */ + } + else allow_zero = FALSE; + + /* Loop to match the subpattern as many times as possible as if it were + a complete pattern. 
*/ + + for (matched_count = 0;; matched_count++) + { + rc = internal_dfa_match( + mb, /* fixed match data */ + code, /* this subexpression's code */ + local_ptr, /* where we currently are */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ + local_offsets, /* offset vector */ + RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ + local_workspace, /* workspace vector */ + RWS_RSIZE, /* size of same */ + rlevel, /* function recursion level */ + RWS); /* recursion workspace */ + + /* Failed to match */ + + if (rc < 0) + { + if (rc != PCRE2_ERROR_NOMATCH) return rc; + break; + } + + /* Matched: break the loop if zero characters matched. */ + + charcount = local_offsets[1] - local_offsets[0]; + if (charcount == 0) break; + local_ptr += charcount; /* Advance temporary position ptr */ + } + + rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; + + /* At this point we have matched the subpattern matched_count + times, and local_ptr is pointing to the character after the end of the + last match. */ + + if (matched_count > 0 || allow_zero) + { + PCRE2_SPTR end_subpattern = code; + int next_state_offset; + + do { end_subpattern += GET(end_subpattern, 1); } + while (*end_subpattern == OP_ALT); + next_state_offset = + (int)(end_subpattern - start_code + LINK_SIZE + 1); + + /* Optimization: if there are no more active states, and there + are no new states yet set up, then skip over the subject string + right here, to save looping. Otherwise, set up the new state to swing + into action when the end of the matched substring is reached. 
*/ + + if (i + 1 >= active_count && new_count == 0) + { + ptr = local_ptr; + clen = 0; + ADD_NEW(next_state_offset, 0); + } + else + { + PCRE2_SPTR p = ptr; + PCRE2_SPTR pp = local_ptr; + charcount = (PCRE2_SIZE)(pp - p); +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; +#endif + ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); + } + } + } + break; + + /*-----------------------------------------------------------------*/ + case OP_ONCE: + { + int rc; + int *local_workspace; + PCRE2_SIZE *local_offsets; + RWS_anchor *rws = (RWS_anchor *)RWS; + + if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) + { + rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); + if (rc != 0) return rc; + RWS = (int *)rws; + } + + local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); + local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; + rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; + + rc = internal_dfa_match( + mb, /* fixed match data */ + code, /* this subexpression's code */ + ptr, /* where we currently are */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ + local_offsets, /* offset vector */ + RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ + local_workspace, /* workspace vector */ + RWS_RSIZE, /* size of same */ + rlevel, /* function recursion level */ + RWS); /* recursion workspace */ + + rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; + + if (rc >= 0) + { + PCRE2_SPTR end_subpattern = code; + PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0]; + int next_state_offset, repeat_state_offset; + + do { end_subpattern += GET(end_subpattern, 1); } + while (*end_subpattern == OP_ALT); + next_state_offset = + (int)(end_subpattern - start_code + LINK_SIZE + 1); + + /* If the end of this subpattern is KETRMAX or KETRMIN, we must + arrange for the repeat state also to be added to the relevant list. + Calculate the offset, or set -1 for no repeat. 
*/ + + repeat_state_offset = (*end_subpattern == OP_KETRMAX || + *end_subpattern == OP_KETRMIN)? + (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; + + /* If we have matched an empty string, add the next state at the + current character pointer. This is important so that the duplicate + checking kicks in, which is what breaks infinite loops that match an + empty string. */ + + if (charcount == 0) + { + ADD_ACTIVE(next_state_offset, 0); + } + + /* Optimization: if there are no more active states, and there + are no new states yet set up, then skip over the subject string + right here, to save looping. Otherwise, set up the new state to swing + into action when the end of the matched substring is reached. */ + + else if (i + 1 >= active_count && new_count == 0) + { + ptr += charcount; + clen = 0; + ADD_NEW(next_state_offset, 0); + + /* If we are adding a repeat state at the new character position, + we must fudge things so that it is the only current state. + Otherwise, it might be a duplicate of one we processed before, and + that would cause it to be skipped. 
*/ + + if (repeat_state_offset >= 0) + { + next_active_state = active_states; + active_count = 0; + i = -1; + ADD_ACTIVE(repeat_state_offset, 0); + } + } + else + { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (utf) + { + PCRE2_SPTR p = start_subject + local_offsets[0]; + PCRE2_SPTR pp = start_subject + local_offsets[1]; + while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; + } +#endif + ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); + if (repeat_state_offset >= 0) + { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); } + } + } + else if (rc != PCRE2_ERROR_NOMATCH) return rc; + } + break; + + +/* ========================================================================== */ + /* Handle callouts */ + + case OP_CALLOUT: + case OP_CALLOUT_STR: + { + PCRE2_SIZE callout_length; + rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0, + &callout_length); + if (rrc < 0) return rrc; /* Abandon */ + if (rrc == 0) + { ADD_ACTIVE(state_offset + (int)callout_length, 0); } + } + break; + + +/* ========================================================================== */ + default: /* Unsupported opcode */ + return PCRE2_ERROR_DFA_UITEM; + } + + NEXT_ACTIVE_STATE: continue; + + } /* End of loop scanning active states */ + + /* We have finished the processing at the current subject character. If no + new states have been set for the next character, we have found all the + matches that we are going to find. If partial matching has been requested, + check for appropriate conditions. + + The "forced_ fail" variable counts the number of (*F) encountered for the + character. If it is equal to the original active_count (saved in + workspace[1]) it means that (*F) was found on every active state. In this + case we don't want to give a partial match. + + The "could_continue" variable is true if a state could have continued but + for the fact that the end of the subject was reached. 
*/ + + if (new_count <= 0) + { + if (could_continue && /* Some could go on, and */ + forced_fail != workspace[1] && /* Not all forced fail & */ + ( /* either... */ + (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */ + || /* or... */ + ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */ + match_count < 0) /* no matches */ + ) && /* And... */ + ( + partial_newline || /* Either partial NL */ + ( /* or ... */ + ptr >= end_subject && /* End of subject and */ + ( /* either */ + ptr > mb->start_used_ptr || /* Inspected non-empty string */ + mb->allowemptypartial /* or pattern has lookbehind */ + ) /* or could match empty */ + ) + )) + match_count = PCRE2_ERROR_PARTIAL; + break; /* Exit from loop along the subject string */ + } + + /* One or more states are active for the next character. */ + + ptr += clen; /* Advance to next subject character */ + } /* Loop to move along the subject string */ + +/* Control gets here from "break" a few lines above. If we have a match and +PCRE2_ENDANCHORED is set, the match fails. */ + +if (match_count >= 0 && + ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 && + ptr < end_subject) + match_count = PCRE2_ERROR_NOMATCH; + +return match_count; +} + + + +/************************************************* +* Match a pattern using the DFA algorithm * +*************************************************/ + +/* This function matches a compiled pattern to a subject string, using the +alternate matching algorithm that finds all matches at once. 
+ +Arguments: + code points to the compiled pattern + subject subject string + length length of subject string + startoffset where to start matching in the subject + options option bits + match_data points to a match data structure + gcontext points to a match context + workspace pointer to workspace + wscount size of workspace + +Returns: > 0 => number of match offset pairs placed in offsets + = 0 => offsets overflowed; longest matches are present + -1 => failed to match + < -1 => some kind of unexpected problem +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount) +{ +int rc; +int was_zero_terminated = 0; + +const pcre2_real_code *re = (const pcre2_real_code *)code; + +PCRE2_SPTR start_match; +PCRE2_SPTR end_subject; +PCRE2_SPTR bumpalong_limit; +PCRE2_SPTR req_cu_ptr; + +BOOL utf, anchored, startline, firstline; +BOOL has_first_cu = FALSE; +BOOL has_req_cu = FALSE; + +#if PCRE2_CODE_UNIT_WIDTH == 8 +PCRE2_SPTR memchr_found_first_cu = NULL; +PCRE2_SPTR memchr_found_first_cu2 = NULL; +#endif + +PCRE2_UCHAR first_cu = 0; +PCRE2_UCHAR first_cu2 = 0; +PCRE2_UCHAR req_cu = 0; +PCRE2_UCHAR req_cu2 = 0; + +const uint8_t *start_bits = NULL; + +/* We need to have mb pointing to a match block, because the IS_NEWLINE macro +is used below, and it expects NLBLOCK to be defined as a pointer. */ + +pcre2_callout_block cb; +dfa_match_block actual_match_block; +dfa_match_block *mb = &actual_match_block; + +/* Set up a starting block of memory for use during recursive calls to +internal_dfa_match(). By putting this on the stack, it minimizes resource use +in the case when it is not needed. If this is too small, more memory is +obtained from the heap. 
At the start of each block is an anchor structure.*/ + +int base_recursion_workspace[RWS_BASE_SIZE]; +RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace; +rws->next = NULL; +rws->size = RWS_BASE_SIZE; +rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE; + +/* Recognize NULL, length 0 as an empty string. */ + +if (subject == NULL && length == 0) subject = (PCRE2_SPTR)""; + +/* Plausibility checks */ + +if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; +if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL) + return PCRE2_ERROR_NULL; + +if (length == PCRE2_ZERO_TERMINATED) + { + length = PRIV(strlen)(subject); + was_zero_terminated = 1; + } + +if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE; +if (start_offset > length) return PCRE2_ERROR_BADOFFSET; + +/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same +time. */ + +if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && + ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) + return PCRE2_ERROR_BADOPTION; + +/* Invalid UTF support is not available for DFA matching. */ + +if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0) + return PCRE2_ERROR_DFA_UINVALID_UTF; + +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE2_ERROR_BADMAGIC. */ + +if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; + +/* Check the code unit width. */ + +if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) + return PCRE2_ERROR_BADMODE; + +/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the +options variable for this function. Users of PCRE2 who are not calling the +function directly would like to have a way of setting these flags, in the same +way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +constructions like (*NO_AUTOPOSSESS). 
To enable this, (*NOTEMPTY) and +(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be +transferred to the options for this function. The bits are guaranteed to be +adjacent, but do not have the same values. This bit of Boolean trickery assumes +that the match-time bits are not more significant than the flag bits. If by +accident this is not the case, a compile-time division by zero error will +occur. */ + +#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) +#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); +#undef FF +#undef OO + +/* If restarting after a partial match, do some sanity checks on the contents +of the workspace. */ + +if ((options & PCRE2_DFA_RESTART) != 0) + { + if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || + workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK)) + return PCRE2_ERROR_DFA_BADRESTART; + } + +/* Set some local values */ + +utf = (re->overall_options & PCRE2_UTF) != 0; +start_match = subject + start_offset; +end_subject = subject + length; +req_cu_ptr = start_match - 1; +anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 || + (re->overall_options & PCRE2_ANCHORED) != 0; + +/* The "must be at the start of a line" flags are used in a loop when finding +where to start. */ + +startline = (re->flags & PCRE2_STARTLINE) != 0; +firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; +bumpalong_limit = end_subject; + +/* Initialize and set up the fixed fields in the callout block, with a pointer +in the match block. */ + +mb->cb = &cb; +cb.version = 2; +cb.subject = subject; +cb.subject_length = (PCRE2_SIZE)(end_subject - subject); +cb.callout_flags = 0; +cb.capture_top = 1; /* No capture support */ +cb.capture_last = 0; +cb.mark = NULL; /* No (*MARK) support */ + +/* Get data from the match context, if present, and fill in the remaining +fields in the match block. 
It is an error to set an offset limit without +setting the flag at compile time. */ + +if (mcontext == NULL) + { + mb->callout = NULL; + mb->memctl = re->memctl; + mb->match_limit = PRIV(default_match_context).match_limit; + mb->match_limit_depth = PRIV(default_match_context).depth_limit; + mb->heap_limit = PRIV(default_match_context).heap_limit; + } +else + { + if (mcontext->offset_limit != PCRE2_UNSET) + { + if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) + return PCRE2_ERROR_BADOFFSETLIMIT; + bumpalong_limit = subject + mcontext->offset_limit; + } + mb->callout = mcontext->callout; + mb->callout_data = mcontext->callout_data; + mb->memctl = mcontext->memctl; + mb->match_limit = mcontext->match_limit; + mb->match_limit_depth = mcontext->depth_limit; + mb->heap_limit = mcontext->heap_limit; + } + +if (mb->match_limit > re->limit_match) + mb->match_limit = re->limit_match; + +if (mb->match_limit_depth > re->limit_depth) + mb->match_limit_depth = re->limit_depth; + +if (mb->heap_limit > re->limit_heap) + mb->heap_limit = re->limit_heap; + +mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + + re->name_count * re->name_entry_size; +mb->tables = re->tables; +mb->start_subject = subject; +mb->end_subject = end_subject; +mb->start_offset = start_offset; +mb->allowemptypartial = (re->max_lookbehind > 0) || + (re->flags & PCRE2_MATCH_EMPTY) != 0; +mb->moptions = options; +mb->poptions = re->overall_options; +mb->match_call_count = 0; +mb->heap_used = 0; + +/* Process the \R and newline settings. 
*/ + +mb->bsr_convention = re->bsr_convention; +mb->nltype = NLTYPE_FIXED; +switch(re->newline_convention) + { + case PCRE2_NEWLINE_CR: + mb->nllen = 1; + mb->nl[0] = CHAR_CR; + break; + + case PCRE2_NEWLINE_LF: + mb->nllen = 1; + mb->nl[0] = CHAR_NL; + break; + + case PCRE2_NEWLINE_NUL: + mb->nllen = 1; + mb->nl[0] = CHAR_NUL; + break; + + case PCRE2_NEWLINE_CRLF: + mb->nllen = 2; + mb->nl[0] = CHAR_CR; + mb->nl[1] = CHAR_NL; + break; + + case PCRE2_NEWLINE_ANY: + mb->nltype = NLTYPE_ANY; + break; + + case PCRE2_NEWLINE_ANYCRLF: + mb->nltype = NLTYPE_ANYCRLF; + break; + + default: return PCRE2_ERROR_INTERNAL; + } + +/* Check a UTF string for validity if required. For 8-bit and 16-bit strings, +we must also check that a starting offset does not point into the middle of a +multiunit character. We check only the portion of the subject that is going to +be inspected during matching - from the offset minus the maximum back reference +to the given length. This saves time when a small part of a large subject is +being matched by the use of a starting offset. Note that the maximum lookbehind +is a number of characters, not code units. */ + +#ifdef SUPPORT_UNICODE +if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) + { + PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + + if (start_offset > 0) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + unsigned int i; + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + return PCRE2_ERROR_BADUTFOFFSET; + for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) + { + check_subject--; + while (check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*check_subject & 0xfc00) == 0xdc00) +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + check_subject--; + } +#else /* In the 32-bit library, one code unit equals one character. 
*/ + check_subject -= re->max_lookbehind; + if (check_subject < subject) check_subject = subject; +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + } + + /* Validate the relevant portion of the subject. After an error, adjust the + offset to be an absolute offset in the whole string. */ + + match_data->rc = PRIV(valid_utf)(check_subject, + length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += (PCRE2_SIZE)(check_subject - subject); + return match_data->rc; + } + } +#endif /* SUPPORT_UNICODE */ + +/* Set up the first code unit to match, if available. If there's no first code +unit there may be a bitmap of possible first characters. */ + +if ((re->flags & PCRE2_FIRSTSET) != 0) + { + has_first_cu = TRUE; + first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); + if ((re->flags & PCRE2_FIRSTCASELESS) != 0) + { + first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); +#endif +#endif /* SUPPORT_UNICODE */ + } + } +else + if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) + start_bits = re->start_bitmap; + +/* There may be a "last known required code unit" set. 
*/ + +if ((re->flags & PCRE2_LASTSET) != 0) + { + has_req_cu = TRUE; + req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); + if ((re->flags & PCRE2_LASTCASELESS) != 0) + { + req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#endif +#endif /* SUPPORT_UNICODE */ + } + } + +/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, +free the memory that was obtained. */ + +if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) + { + match_data->memctl.free((void *)match_data->subject, + match_data->memctl.memory_data); + match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; + } + +/* Fill in fields that are always returned in the match data. */ + +match_data->code = re; +match_data->subject = NULL; /* Default for no match */ +match_data->mark = NULL; +match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER; + +/* Call the main matching function, looping for a non-anchored regex after a +failed match. If not restarting, perform certain optimizations at the start of +a match. */ + +for (;;) + { + /* ----------------- Start of match optimizations ---------------- */ + + /* There are some optimizations that avoid running the match if a known + starting point is not found, or if a known later code unit is not present. + However, there is an option (settable at compile time) that disables + these, for testing and for ensuring that all callouts do actually occur. + The optimizations must also be avoided when restarting a DFA match. */ + + if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && + (options & PCRE2_DFA_RESTART) == 0) + { + /* If firstline is TRUE, the start of the match is constrained to the first + line of a multiline string. 
That is, the match must be before or at the + first newline following the start of matching. Temporarily adjust + end_subject so that we stop the optimization scans for a first code unit + immediately after the first character of a newline (the first code unit can + legitimately be a newline). If the match fails at the newline, later code + breaks this loop. */ + + if (firstline) + { + PCRE2_SPTR t = start_match; +#ifdef SUPPORT_UNICODE + if (utf) + { + while (t < end_subject && !IS_NEWLINE(t)) + { + t++; + ACROSSCHAR(t < end_subject, t, t++); + } + } + else +#endif + while (t < end_subject && !IS_NEWLINE(t)) t++; + end_subject = t; + } + + /* Anchored: check the first code unit if one is recorded. This may seem + pointless but it can help in detecting a no match case without scanning for + the required code unit. */ + + if (anchored) + { + if (has_first_cu || start_bits != NULL) + { + BOOL ok = start_match < end_subject; + if (ok) + { + PCRE2_UCHAR c = UCHAR21TEST(start_match); + ok = has_first_cu && (c == first_cu || c == first_cu2); + if (!ok && start_bits != NULL) + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + ok = (start_bits[c/8] & (1u << (c&7))) != 0; + } + } + if (!ok) break; + } + } + + /* Not anchored. Advance to a unique first code unit if there is one. */ + + else + { + if (has_first_cu) + { + if (first_cu != first_cu2) /* Caseless */ + { + /* In 16-bit and 32_bit modes we have to do our own search, so can + look for both cases at once. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + PCRE2_UCHAR smc; + while (start_match < end_subject && + (smc = UCHAR21TEST(start_match)) != first_cu && + smc != first_cu2) + start_match++; +#else + /* In 8-bit mode, the use of memchr() gives a big speed up, even + though we have to call it twice in order to find the earliest + occurrence of the code unit in either of its cases. Caching is used + to remember the positions of previously found code units. 
This can + make a huge difference when the strings are very long and only one + case is actually present. */ + + PCRE2_SPTR pp1 = NULL; + PCRE2_SPTR pp2 = NULL; + PCRE2_SIZE searchlength = end_subject - start_match; + + /* If we haven't got a previously found position for first_cu, or if + the current starting position is later, we need to do a search. If + the code unit is not found, set it to the end. */ + + if (memchr_found_first_cu == NULL || + start_match > memchr_found_first_cu) + { + pp1 = memchr(start_match, first_cu, searchlength); + memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; + } + + /* If the start is before a previously found position, use the + previous position, or NULL if a previous search failed. */ + + else pp1 = (memchr_found_first_cu == end_subject)? NULL : + memchr_found_first_cu; + + /* Do the same thing for the other case. */ + + if (memchr_found_first_cu2 == NULL || + start_match > memchr_found_first_cu2) + { + pp2 = memchr(start_match, first_cu2, searchlength); + memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; + } + + else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : + memchr_found_first_cu2; + + /* Set the start to the end of the subject if neither case was found. + Otherwise, use the earlier found point. */ + + if (pp1 == NULL) + start_match = (pp2 == NULL)? end_subject : pp2; + else + start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; + +#endif /* 8-bit handling */ + } + + /* The caseful case is much simpler. 
*/ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (start_match < end_subject && UCHAR21TEST(start_match) != + first_cu) + start_match++; +#else /* 8-bit code units */ + start_match = memchr(start_match, first_cu, end_subject - start_match); + if (start_match == NULL) start_match = end_subject; +#endif + } + + /* If we can't find the required code unit, having reached the true end + of the subject, break the bumpalong loop, to force a match failure, + except when doing partial matching, when we let the next cycle run at + the end of the subject. To see why, consider the pattern /(?<=abc)def/, + which partially matches "abc", even though the string does not contain + the starting character "d". If we have not reached the true end of the + subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) + we also let the cycle run, because the matching string is legitimately + allowed to start with the first code unit of a newline. */ + + if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && + start_match >= mb->end_subject) + break; + } + + /* If there's no first code unit, advance to just after a linebreak for a + multiline match if required. */ + + else if (startline) + { + if (start_match > mb->start_subject + start_offset) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + ACROSSCHAR(start_match < end_subject, start_match, start_match++); + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or + ANYCRLF, and we are now at a LF, advance the match position by one + more code unit. 
*/ + + if (start_match[-1] == CHAR_CR && + (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + UCHAR21TEST(start_match) == CHAR_NL) + start_match++; + } + } + + /* If there's no first code unit or a requirement for a multiline line + start, advance to a non-unique first code unit if any have been + identified. The bitmap contains only 256 bits. When code units are 16 or + 32 bits wide, all code units greater than 254 set the 255 bit. */ + + else if (start_bits != NULL) + { + while (start_match < end_subject) + { + uint32_t c = UCHAR21TEST(start_match); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + if ((start_bits[c/8] & (1u << (c&7))) != 0) break; + start_match++; + } + + /* See comment above in first_cu checking about the next line. */ + + if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && + start_match >= mb->end_subject) + break; + } + } /* End of first code unit handling */ + + /* Restore fudged end_subject */ + + end_subject = mb->end_subject; + + /* The following two optimizations are disabled for partial matching. */ + + if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) + { + PCRE2_SPTR p; + + /* The minimum matching length is a lower bound; no actual string of that + length may actually match the pattern. Although the value is, strictly, + in characters, we treat it as code units to avoid spending too much time + in this optimization. */ + + if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT; + + /* If req_cu is set, we know that that code unit must appear in the + subject for the match to succeed. If the first code unit is set, req_cu + must be later in the subject; otherwise the test starts at the match + point. This optimization can save a huge amount of backtracking in + patterns with nested unlimited repeats that aren't going to match. 
+ Writing separate code for cased/caseless versions makes it go faster, as + does using an autoincrement and backing off on a match. As in the case of + the first code unit, using memchr() in the 8-bit library gives a big + speed up. Unlike the first_cu check above, we do not need to call + memchr() twice in the caseless case because we only need to check for the + presence of the character in either case, not find the first occurrence. + + The search can be skipped if the code unit was found later than the + current starting point in a previous iteration of the bumpalong loop. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary + patterns. This showed up when somebody was matching something like + /^\d+C/ on a 32-megabyte string... so we don't do this when the string is + sufficiently long, but it's worth searching a lot more for unanchored + patterns. */ + + p = start_match + (has_first_cu? 1:0); + if (has_req_cu && p > req_cu_ptr) + { + PCRE2_SIZE check_length = end_subject - start_match; + + if (check_length < REQ_CU_MAX || + (!anchored && check_length < REQ_CU_MAX * 1000)) + { + if (req_cu != req_cu2) /* Caseless */ + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (p < end_subject) + { + uint32_t pp = UCHAR21INCTEST(p); + if (pp == req_cu || pp == req_cu2) { p--; break; } + } +#else /* 8-bit code units */ + PCRE2_SPTR pp = p; + p = memchr(pp, req_cu, end_subject - pp); + if (p == NULL) + { + p = memchr(pp, req_cu2, end_subject - pp); + if (p == NULL) p = end_subject; + } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + } + + /* The caseful case */ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (p < end_subject) + { + if (UCHAR21INCTEST(p) == req_cu) { p--; break; } + } + +#else /* 8-bit code units */ + p = memchr(p, req_cu, end_subject - p); + if (p == NULL) p = end_subject; +#endif + } + + /* If we can't find the required code unit, break the matching loop, + forcing a 
match failure. */ + + if (p >= end_subject) break; + + /* If we have found the required code unit, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this code unit yet. */ + + req_cu_ptr = p; + } + } + } + } + + /* ------------ End of start of match optimizations ------------ */ + + /* Give no match if we have passed the bumpalong limit. */ + + if (start_match > bumpalong_limit) break; + + /* OK, now we can do the business */ + + mb->start_used_ptr = start_match; + mb->last_used_ptr = start_match; + mb->recursive = NULL; + + rc = internal_dfa_match( + mb, /* fixed match data */ + mb->start_code, /* this subexpression's code */ + start_match, /* where we currently are */ + start_offset, /* start offset in subject */ + match_data->ovector, /* offset vector */ + (uint32_t)match_data->oveccount * 2, /* actual size of same */ + workspace, /* workspace vector */ + (int)wscount, /* size of same */ + 0, /* function recurse level */ + base_recursion_workspace); /* initial workspace for recursion */ + + /* Anything other than "no match" means we are done, always; otherwise, carry + on only if not anchored. 
*/ + + if (rc != PCRE2_ERROR_NOMATCH || anchored) + { + if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0) + { + match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject); + match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); + } + match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); + match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject); + match_data->startchar = (PCRE2_SIZE)(start_match - subject); + match_data->rc = rc; + + if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0) + { + length = CU2BYTES(length + was_zero_terminated); + match_data->subject = match_data->memctl.malloc(length, + match_data->memctl.memory_data); + if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy((void *)match_data->subject, subject, length); + match_data->flags |= PCRE2_MD_COPIED_SUBJECT; + } + else + { + if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject; + } + goto EXIT; + } + + /* Advance to the next subject character unless we are at the end of a line + and firstline is set. */ + + if (firstline && IS_NEWLINE(start_match)) break; + start_match++; +#ifdef SUPPORT_UNICODE + if (utf) + { + ACROSSCHAR(start_match < end_subject, start_match, start_match++); + } +#endif + if (start_match > end_subject) break; + + /* If we have just passed a CR and we are now at a LF, and the pattern does + not contain any explicit matches for \r or \n, and the newline option is CRLF + or ANY or ANYCRLF, advance the match position by one more character. 
*/ + + if (UCHAR21TEST(start_match - 1) == CHAR_CR && + start_match < end_subject && + UCHAR21TEST(start_match) == CHAR_NL && + (re->flags & PCRE2_HASCRORLF) == 0 && + (mb->nltype == NLTYPE_ANY || + mb->nltype == NLTYPE_ANYCRLF || + mb->nllen == 2)) + start_match++; + + } /* "Bumpalong" loop */ + +NOMATCH_EXIT: +rc = PCRE2_ERROR_NOMATCH; + +EXIT: +while (rws->next != NULL) + { + RWS_anchor *next = rws->next; + rws->next = next->next; + mb->memctl.free(next, mb->memctl.memory_data); + } + +return rc; +} + +/* These #undefs are here to enable unity builds with CMake. */ + +#undef NLBLOCK /* Block containing newline information */ +#undef PSSTART /* Field containing processed string start */ +#undef PSEND /* Field containing processed string end */ + +/* End of pcre2_dfa_match.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_error.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_error.c new file mode 100644 index 0000000000..b8afc12755 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_error.c @@ -0,0 +1,341 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + +#define STRING(a) # a +#define XSTRING(s) STRING(s) + +/* The texts of compile-time error messages. Compile-time error numbers start +at COMPILE_ERROR_BASE (100). + +This used to be a table of strings, but in order to reduce the number of +relocations needed when a shared library is loaded dynamically, it is now one +long string. We cannot use a table of offsets, because the lengths of inserts +such as XSTRING(MAX_NAME_SIZE) are not known. Instead, +pcre2_get_error_message() counts through to the one it wants - this isn't a +performance issue because these strings are used only when there is an error. 
+ +Each substring ends with \0 to insert a null character. This includes the final +substring, so that the whole string ends with \0\0, which can be detected when +counting through. */ + +static const unsigned char compile_error_texts[] = + "no error\0" + "\\ at end of pattern\0" + "\\c at end of pattern\0" + "unrecognized character follows \\\0" + "numbers out of order in {} quantifier\0" + /* 5 */ + "number too big in {} quantifier\0" + "missing terminating ] for character class\0" + "escape sequence is invalid in character class\0" + "range out of order in character class\0" + "quantifier does not follow a repeatable item\0" + /* 10 */ + "internal error: unexpected repeat\0" + "unrecognized character after (? or (?-\0" + "POSIX named classes are supported only within a class\0" + "POSIX collating elements are not supported\0" + "missing closing parenthesis\0" + /* 15 */ + "reference to non-existent subpattern\0" + "pattern passed as NULL\0" + "unrecognised compile-time option bit(s)\0" + "missing ) after (?# comment\0" + "parentheses are too deeply nested\0" + /* 20 */ + "regular expression is too large\0" + "failed to allocate heap memory\0" + "unmatched closing parenthesis\0" + "internal error: code overflow\0" + "missing closing parenthesis for condition\0" + /* 25 */ + "lookbehind assertion is not fixed length\0" + "a relative value of zero is not allowed\0" + "conditional subpattern contains more than two branches\0" + "assertion expected after (?( or (?(?C)\0" + "digit expected after (?+ or (?-\0" + /* 30 */ + "unknown POSIX class name\0" + "internal error in pcre2_study(): should not occur\0" + "this version of PCRE2 does not have Unicode support\0" + "parentheses are too deeply nested (stack check)\0" + "character code point value in \\x{} or \\o{} is too large\0" + /* 35 */ + "lookbehind is too complicated\0" + "\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0" + "PCRE2 does not support \\F, \\L, \\l, 
\\N{name}, \\U, or \\u\0" + "number after (?C is greater than 255\0" + "closing parenthesis for (?C expected\0" + /* 40 */ + "invalid escape sequence in (*VERB) name\0" + "unrecognized character after (?P\0" + "syntax error in subpattern name (missing terminator?)\0" + "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0" + "subpattern name must start with a non-digit\0" + /* 45 */ + "this version of PCRE2 does not have support for \\P, \\p, or \\X\0" + "malformed \\P or \\p sequence\0" + "unknown property after \\P or \\p\0" + "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0" + "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" + /* 50 */ + "invalid range in character class\0" + "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0" + "internal error: overran compiling workspace\0" + "internal error: previously-checked referenced subpattern not found\0" + "DEFINE subpattern contains more than one branch\0" + /* 55 */ + "missing opening brace after \\o\0" + "internal error: unknown newline setting\0" + "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" + "(?R (recursive pattern call) must be followed by a closing parenthesis\0" + /* "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" */ + "obsolete error (should not occur)\0" /* Was the above */ + /* 60 */ + "(*VERB) not recognized or malformed\0" + "subpattern number is too big\0" + "subpattern name expected\0" + "internal error: parsed pattern overflow\0" + "non-octal character in \\o{} (closing brace missing?)\0" + /* 65 */ + "different names for subpatterns of the same number are not allowed\0" + "(*MARK) must have an argument\0" + "non-hex character in \\x{} (closing brace missing?)\0" +#ifndef EBCDIC + "\\c must be followed by a printable ASCII character\0" +#else + "\\c must be followed by a letter or one of [\\]^_?\0" +#endif + "\\k is not followed by a braced, 
angle-bracketed, or quoted name\0" + /* 70 */ + "internal error: unknown meta code in check_lookbehinds()\0" + "\\N is not supported in a class\0" + "callout string is too long\0" + "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" + "using UTF is disabled by the application\0" + /* 75 */ + "using UCP is disabled by the application\0" + "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" + "character code point value in \\u.... sequence is too large\0" + "digits missing in \\x{} or \\o{} or \\N{U+}\0" + "syntax error or number too big in (?(VERSION condition\0" + /* 80 */ + "internal error: unknown opcode in auto_possessify()\0" + "missing terminating delimiter for callout with string argument\0" + "unrecognized string delimiter follows (?C\0" + "using \\C is disabled by the application\0" + "(?| and/or (?J: or (?x: parentheses are too deeply nested\0" + /* 85 */ + "using \\C is disabled in this PCRE2 library\0" + "regular expression is too complicated\0" + "lookbehind assertion is too long\0" + "pattern string is longer than the limit set by the application\0" + "internal error: unknown code in parsed pattern\0" + /* 90 */ + "internal error: bad code value in parsed_skip()\0" + "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0" + "invalid option bits with PCRE2_LITERAL\0" + "\\N{U+dddd} is supported only in Unicode (UTF) mode\0" + "invalid hyphen in option setting\0" + /* 95 */ + "(*alpha_assertion) not recognized\0" + "script runs require Unicode support, which this version of PCRE2 does not have\0" + "too many capturing groups (maximum 65535)\0" + "atomic assertion expected after (?( or (?(?C)\0" + "\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0" + ; + +/* Match-time and UTF error texts are in the same format. 
*/ + +static const unsigned char match_error_texts[] = + "no error\0" + "no match\0" + "partial match\0" + "UTF-8 error: 1 byte missing at end\0" + "UTF-8 error: 2 bytes missing at end\0" + /* 5 */ + "UTF-8 error: 3 bytes missing at end\0" + "UTF-8 error: 4 bytes missing at end\0" + "UTF-8 error: 5 bytes missing at end\0" + "UTF-8 error: byte 2 top bits not 0x80\0" + "UTF-8 error: byte 3 top bits not 0x80\0" + /* 10 */ + "UTF-8 error: byte 4 top bits not 0x80\0" + "UTF-8 error: byte 5 top bits not 0x80\0" + "UTF-8 error: byte 6 top bits not 0x80\0" + "UTF-8 error: 5-byte character is not allowed (RFC 3629)\0" + "UTF-8 error: 6-byte character is not allowed (RFC 3629)\0" + /* 15 */ + "UTF-8 error: code points greater than 0x10ffff are not defined\0" + "UTF-8 error: code points 0xd800-0xdfff are not defined\0" + "UTF-8 error: overlong 2-byte sequence\0" + "UTF-8 error: overlong 3-byte sequence\0" + "UTF-8 error: overlong 4-byte sequence\0" + /* 20 */ + "UTF-8 error: overlong 5-byte sequence\0" + "UTF-8 error: overlong 6-byte sequence\0" + "UTF-8 error: isolated byte with 0x80 bit set\0" + "UTF-8 error: illegal byte (0xfe or 0xff)\0" + "UTF-16 error: missing low surrogate at end\0" + /* 25 */ + "UTF-16 error: invalid low surrogate\0" + "UTF-16 error: isolated low surrogate\0" + "UTF-32 error: code points 0xd800-0xdfff are not defined\0" + "UTF-32 error: code points greater than 0x10ffff are not defined\0" + "bad data value\0" + /* 30 */ + "patterns do not all use the same character tables\0" + "magic number missing\0" + "pattern compiled in wrong mode: 8/16/32-bit error\0" + "bad offset value\0" + "bad option value\0" + /* 35 */ + "invalid replacement string\0" + "bad offset into UTF string\0" + "callout error code\0" /* Never returned by PCRE2 itself */ + "invalid data in workspace for DFA restart\0" + "too much recursion for DFA matching\0" + /* 40 */ + "backreference condition or recursion test is not supported for DFA matching\0" + "function is not supported for 
DFA matching\0" + "pattern contains an item that is not supported for DFA matching\0" + "workspace size exceeded in DFA matching\0" + "internal error - pattern overwritten?\0" + /* 45 */ + "bad JIT option\0" + "JIT stack limit reached\0" + "match limit exceeded\0" + "no more memory\0" + "unknown substring\0" + /* 50 */ + "non-unique substring name\0" + "NULL argument passed with non-zero length\0" + "nested recursion at the same subject position\0" + "matching depth limit exceeded\0" + "requested value is not available\0" + /* 55 */ + "requested value is not set\0" + "offset limit set without PCRE2_USE_OFFSET_LIMIT\0" + "bad escape sequence in replacement string\0" + "expected closing curly bracket in replacement string\0" + "bad substitution in replacement string\0" + /* 60 */ + "match with end before start or start moved backwards is not supported\0" + "too many replacements (more than INT_MAX)\0" + "bad serialized data\0" + "heap limit exceeded\0" + "invalid syntax\0" + /* 65 */ + "internal error - duplicate substitution match\0" + "PCRE2_MATCH_INVALID_UTF is not supported for DFA matching\0" + ; + + +/************************************************* +* Return error message * +*************************************************/ + +/* This function copies an error message into a buffer whose units are of an +appropriate width. Error numbers are positive for compile-time errors, and +negative for match-time errors (except for UTF errors), but the numbers are all +distinct. 
+ +Arguments: + enumber error number + buffer where to put the message (zero terminated) + size size of the buffer in code units + +Returns: length of message if all is well + negative on error +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_get_error_message(int enumber, PCRE2_UCHAR *buffer, PCRE2_SIZE size) +{ +const unsigned char *message; +PCRE2_SIZE i; +int n; + +if (size == 0) return PCRE2_ERROR_NOMEMORY; + +if (enumber >= COMPILE_ERROR_BASE) /* Compile error */ + { + message = compile_error_texts; + n = enumber - COMPILE_ERROR_BASE; + } +else if (enumber < 0) /* Match or UTF error */ + { + message = match_error_texts; + n = -enumber; + } +else /* Invalid error number */ + { + message = (unsigned char *)"\0"; /* Empty message list */ + n = 1; + } + +for (; n > 0; n--) + { + while (*message++ != CHAR_NUL) {}; + if (*message == CHAR_NUL) return PCRE2_ERROR_BADDATA; + } + +for (i = 0; *message != 0; i++) + { + if (i >= size - 1) + { + buffer[i] = 0; /* Terminate partial message */ + return PCRE2_ERROR_NOMEMORY; + } + buffer[i] = *message++; + } + +buffer[i] = 0; +return (int)i; +} + +/* End of pcre2_error.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_extuni.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_extuni.c new file mode 100644 index 0000000000..a8ebf90755 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_extuni.c @@ -0,0 +1,148 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains an internal function that is used to match a Unicode +extended grapheme sequence. It is used by both pcre2_match() and +pcre2_dfa_match(). 
However, it is called only when Unicode support is being +compiled. Nevertheless, we provide a dummy function when there is no Unicode +support, because some compilers do not like functionless source files. */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + + +#include "regexp/pcre2/pcre2_internal.h" + + +/* Dummy function: it ignores every argument and returns NULL */ + +#ifndef SUPPORT_UNICODE +PCRE2_SPTR +PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, + PCRE2_SPTR end_subject, BOOL utf, int *xcount) +{ +(void)c; +(void)eptr; +(void)start_subject; +(void)end_subject; +(void)utf; +(void)xcount; +return NULL; +} +#else + + +/************************************************* +* Match an extended grapheme sequence * +*************************************************/ + +/* +Arguments: + c the first character + eptr pointer to next character + start_subject pointer to start of subject + end_subject pointer to end of subject + utf TRUE if in UTF mode + xcount pointer to count of additional characters, + or NULL if count not needed + +Returns: pointer after the end of the sequence +*/ + +PCRE2_SPTR +PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, + PCRE2_SPTR end_subject, BOOL utf, int *xcount) +{ +int lgb = UCD_GRAPHBREAK(c); /* Grapheme break property of the left-hand character */ + +while (eptr < end_subject) + { + int rgb; /* Grapheme break property of the right-hand character */ + int len = 1; /* Consumed through eptr += len at the bottom of the loop */ + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } + rgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; /* The table has no bit set for this left/right pair, so the sequence ends here */ + + /* Not breaking between Regional Indicators is allowed only if there + are an even number of preceding RIs. 
*/ + + if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) + { + int ricount = 0; /* Regional Indicators found before the left-hand character */ + PCRE2_SPTR bptr = eptr - 1; + if (utf) BACKCHAR(bptr); + + /* bptr is pointing to the left-hand character */ + + while (bptr > start_subject) + { + bptr--; + if (utf) + { + BACKCHAR(bptr); + GETCHAR(c, bptr); + } + else + c = *bptr; + if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; + ricount++; + } + if ((ricount & 1) != 0) break; /* Grapheme break required */ + } + + /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this + allows any number of them before a following Extended_Pictographic. */ + + if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || + lgb != ucp_gbExtended_Pictographic) + lgb = rgb; + + eptr += len; + if (xcount != NULL) *xcount += 1; /* One more character absorbed into the sequence */ + } + +return eptr; +} + +#endif /* SUPPORT_UNICODE */ + +/* End of pcre2_extuni.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_find_bracket.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_find_bracket.c new file mode 100644 index 0000000000..09c39b0b53 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_find_bracket.c @@ -0,0 +1,219 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains a single function that scans through a compiled pattern +until it finds a capturing bracket with the given number, or, if the number is +negative, an instance of OP_REVERSE for a lookbehind. The function is called +from pcre2_compile.c and also from pcre2_study.c when finding the minimum +matching length. 
*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + +/************************************************* +* Scan compiled regex for specific bracket * +*************************************************/ + +/* +Arguments: + code points to start of expression + utf TRUE in UTF mode + number the required bracket number or negative to find a lookbehind + +Returns: pointer to the opcode for the bracket, or NULL if not found +*/ + +PCRE2_SPTR +PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) +{ +for (;;) + { + PCRE2_UCHAR c = *code; /* Opcode of the current item */ + + if (c == OP_END) return NULL; /* Reached OP_END without finding the requested item */ + + /* XCLASS is used for classes that cannot be represented just by a bit map. + This includes negated single high-valued characters. CALLOUT_STR is used for + callouts with string arguments. In both cases the length in the table is + zero; the actual length is stored in the compiled code. */ + + if (c == OP_XCLASS) code += GET(code, 1); + else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); + + /* Handle lookbehind */ + + else if (c == OP_REVERSE) + { + if (number < 0) return (PCRE2_UCHAR *)code; /* Any OP_REVERSE satisfies a negative request */ + code += PRIV(OP_lengths)[c]; + } + + /* Handle capturing bracket */ + + else if (c == OP_CBRA || c == OP_SCBRA || + c == OP_CBRAPOS || c == OP_SCBRAPOS) + { + int n = (int)GET2(code, 1+LINK_SIZE); /* Number stored in this bracket item */ + if (n == number) return (PCRE2_UCHAR *)code; + code += PRIV(OP_lengths)[c]; + } + + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. 
*/ + + else + { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; /* Skip the two extra parameter units */ + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) /* Here the type sits IMM2_SIZE units further on */ + code += 2; + break; + + case OP_MARK: + case OP_COMMIT_ARG: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; /* code[1] is added as the length of the argument */ + break; + } + + /* Add in the fixed length from the table */ + + code += PRIV(OP_lengths)[c]; + + /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a minimum, so + we have to arrange to skip the extra bytes. */ + +#ifdef MAYBE_UTF_MULTI + if (utf) switch(c) + { + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + case OP_UPTO: + case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_MINUPTO: + case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_POSUPTO: + case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + case OP_STAR: + case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_MINSTAR: + case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_POSSTAR: + case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_PLUS: + case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_MINPLUS: + case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_POSPLUS: + case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + case OP_QUERY: + case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_MINQUERY: + case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case 
OP_POSQUERY: + case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); /* code[-1] is the last unit covered by the table length added above */ + break; + } +#else + (void)(utf); /* Keep compiler happy by referencing function argument */ +#endif /* MAYBE_UTF_MULTI */ + } + } +} + +/* End of pcre2_find_bracket.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_fuzzsupport.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_fuzzsupport.c new file mode 100644 index 0000000000..274039ecc5 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_fuzzsupport.c @@ -0,0 +1,369 @@ +/*************************************************************************** +Fuzzer driver for PCRE2. Given an arbitrary string of bytes and a length, it +tries to compile and match it, deriving options from the string itself. If +STANDALONE is defined, a main program that calls the driver with the contents +of specified files is compiled, and commentary on what is happening is output. +If an argument starts with '=' the rest of it is taken as a literal string +rather than a file name. This allows easy testing of short strings. 
+ +Written by Philip Hazel, October 2016 +***************************************************************************/ + +#include +#include +#include +#include + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include "regexp/pcre2/pcre2.h" + +#define MAX_MATCH_SIZE 1000 /* Cap on the subject length handed to the matchers */ + +#define DFA_WORKSPACE_COUNT 100 /* Number of ints in the workspace passed to pcre2_dfa_match() */ + +#define ALLOWED_COMPILE_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ + PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ + PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_ENDANCHORED|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \ + PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ + PCRE2_NO_AUTO_CAPTURE| \ + PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ + PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ + PCRE2_UTF) + +#define ALLOWED_MATCH_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ + PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_HARD| \ + PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT) + +/* This is the callout function. Its only purpose is to halt matching if there +are more than 100 callouts, as one way of stopping too much time being spent on +fruitless matches. The callout data is a pointer to the counter. The counter is +read and written as a uint32_t. */ + +static int callout_function(pcre2_callout_block *cb, void *callout_data) +{ +(void)cb; /* Avoid unused parameter warning */ +*((uint32_t *)callout_data) += 1; /* Bump the counter that callout_data points to */ +return (*((uint32_t *)callout_data) > 100)? PCRE2_ERROR_CALLOUT : 0; /* Error return once the count passes 100, otherwise zero */ +} + +/* Putting in this apparently unnecessary prototype prevents gcc from giving a +"no previous prototype" warning when compiling at high warning level. */ + +int LLVMFuzzerTestOneInput(const unsigned char *, size_t); + +/* Here's the driving function. 
*/ + +int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size) +{ +uint32_t compile_options; +uint32_t match_options; +pcre2_match_data *match_data = NULL; +pcre2_match_context *match_context = NULL; +size_t match_size; +int dfa_workspace[DFA_WORKSPACE_COUNT]; +int r1, r2; +int i; + +if (size < 1) return 0; /* Nothing to do for empty input; this also keeps data[size/2] below in bounds */ + +/* Limiting the length of the subject for matching stops fruitless searches +in large trees taking too much time. */ + +match_size = (size > MAX_MATCH_SIZE)? MAX_MATCH_SIZE : size; + +/* Figure out some options to use. Initialize the random number to ensure +repeatability. Ensure that we get a 32-bit unsigned random number for testing +options. (RAND_MAX is required to be at least 32767, but is commonly +2147483647, which excludes the top bit.) */ + +srand((unsigned int)(data[size/2])); /* The seed is the middle byte of the input */ +r1 = rand(); +r2 = rand(); + +/* Ensure that all undefined option bits are zero (waste of time trying them) +and also that PCRE2_NO_UTF_CHECK is unset, as there is no guarantee that the +input is UTF-8. Also unset PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as there is no +reason to disallow UTF and UCP. Force PCRE2_NEVER_BACKSLASH_C to be set because +\C in random patterns is highly likely to cause a crash. */ + +compile_options = + ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_COMPILE_OPTIONS) | + PCRE2_NEVER_BACKSLASH_C; + +match_options = + ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_MATCH_OPTIONS); + +/* Discard partial matching if PCRE2_ENDANCHORED is set, because they are not +allowed together and just give an immediate error return. */ + +if (((compile_options|match_options) & PCRE2_ENDANCHORED) != 0) + match_options &= ~(PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT); + +/* Do the compile with and without the options, and after a successful compile, +likewise do the match with and without the options. 
*/ + +for (i = 0; i < 2; i++) + { + uint32_t callout_count; /* NOTE(review): its address is registered with the match context only when that context is first created, yet this variable is declared afresh on each pass of this loop - confirm the pointer is still valid on the second pass */ + int errorcode; + PCRE2_SIZE erroroffset; + pcre2_code *code; + +#ifdef STANDALONE + printf("Compile options %.8x never_backslash_c", compile_options); + printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + ((compile_options & PCRE2_ALT_BSUX) != 0)? ",alt_bsux" : "", + ((compile_options & PCRE2_ALT_CIRCUMFLEX) != 0)? ",alt_circumflex" : "", + ((compile_options & PCRE2_ALT_VERBNAMES) != 0)? ",alt_verbnames" : "", + ((compile_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? ",allow_empty_class" : "", + ((compile_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", + ((compile_options & PCRE2_AUTO_CALLOUT) != 0)? ",auto_callout" : "", + ((compile_options & PCRE2_CASELESS) != 0)? ",caseless" : "", + ((compile_options & PCRE2_DOLLAR_ENDONLY) != 0)? ",dollar_endonly" : "", + ((compile_options & PCRE2_DOTALL) != 0)? ",dotall" : "", + ((compile_options & PCRE2_DUPNAMES) != 0)? ",dupnames" : "", + ((compile_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", + ((compile_options & PCRE2_EXTENDED) != 0)? ",extended" : "", + ((compile_options & PCRE2_FIRSTLINE) != 0)? ",firstline" : "", + ((compile_options & PCRE2_MATCH_UNSET_BACKREF) != 0)? ",match_unset_backref" : "", + ((compile_options & PCRE2_MULTILINE) != 0)? ",multiline" : "", + ((compile_options & PCRE2_NEVER_UCP) != 0)? ",never_ucp" : "", + ((compile_options & PCRE2_NEVER_UTF) != 0)? ",never_utf" : "", + ((compile_options & PCRE2_NO_AUTO_CAPTURE) != 0)? ",no_auto_capture" : "", + ((compile_options & PCRE2_NO_AUTO_POSSESS) != 0)? ",no_auto_possess" : "", + ((compile_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? ",no_dotstar_anchor" : "", + ((compile_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", + ((compile_options & PCRE2_NO_START_OPTIMIZE) != 0)? ",no_start_optimize" : "", + ((compile_options & PCRE2_UCP) != 0)? ",ucp" : "", + ((compile_options & PCRE2_UNGREEDY) != 0)? ",ungreedy" : "", + ((compile_options & PCRE2_USE_OFFSET_LIMIT) != 0)? 
",use_offset_limit" : "", + ((compile_options & PCRE2_UTF) != 0)? ",utf" : ""); +#endif + + code = pcre2_compile((PCRE2_SPTR)data, (PCRE2_SIZE)size, compile_options, + &errorcode, &erroroffset, NULL); + + /* Compilation succeeded */ + + if (code != NULL) + { + int j; + uint32_t save_match_options = match_options; + +#ifdef SUPPORT_JIT + pcre2_jit_compile(code, PCRE2_JIT_COMPLETE); +#endif + + /* Create match data and context blocks only when we first need them. Set + low match and depth limits to avoid wasting too much time searching large + pattern trees. Almost all matches are going to fail. */ + + if (match_data == NULL) + { + match_data = pcre2_match_data_create(32, NULL); + if (match_data == NULL) + { +#ifdef STANDALONE + printf("** Failed to create match data block\n"); +#endif + return 0; /* NOTE(review): this path returns without calling pcre2_code_free(code) */ + } + } + + if (match_context == NULL) + { + match_context = pcre2_match_context_create(NULL); + if (match_context == NULL) + { +#ifdef STANDALONE + printf("** Failed to create match context block\n"); +#endif + return 0; /* NOTE(review): this path returns without freeing code or match_data */ + } + (void)pcre2_set_match_limit(match_context, 100); + (void)pcre2_set_depth_limit(match_context, 100); + (void)pcre2_set_callout(match_context, callout_function, &callout_count); + } + + /* Match twice, with and without options. */ + + for (j = 0; j < 2; j++) + { +#ifdef STANDALONE + printf("Match options %.8x", match_options); + printf("%s%s%s%s%s%s%s%s%s%s\n", + ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", + ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", + ((match_options & PCRE2_NO_JIT) != 0)? ",no_jit" : "", + ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", + ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", + ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", + ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", + ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", + ((match_options & PCRE2_PARTIAL_HARD) != 0)? 
",partial_hard" : "", + ((match_options & PCRE2_PARTIAL_SOFT) != 0)? ",partial_soft" : ""); +#endif + + callout_count = 0; /* Restart the callout count for this match */ + errorcode = pcre2_match(code, (PCRE2_SPTR)data, (PCRE2_SIZE)match_size, 0, + match_options, match_data, match_context); + +#ifdef STANDALONE + if (errorcode >= 0) printf("Match returned %d\n", errorcode); else + { + unsigned char buffer[256]; + pcre2_get_error_message(errorcode, buffer, 256); + printf("Match failed: error %d: %s\n", errorcode, buffer); + } +#endif + + match_options = 0; /* For second time */ + } + + /* Match with DFA twice, with and without options. */ + + match_options = save_match_options & ~PCRE2_NO_JIT; /* Not valid for DFA */ + + for (j = 0; j < 2; j++) + { +#ifdef STANDALONE + printf("DFA match options %.8x", match_options); + printf("%s%s%s%s%s%s%s%s%s\n", + ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "", + ((match_options & PCRE2_ENDANCHORED) != 0)? ",endanchored" : "", + ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "", + ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "", + ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "", + ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "", + ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "", + ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "", + ((match_options & PCRE2_PARTIAL_SOFT) != 0)? 
",partial_soft" : ""); +#endif + + callout_count = 0; /* Restart the callout count for this match */ + errorcode = pcre2_dfa_match(code, (PCRE2_SPTR)data, + (PCRE2_SIZE)match_size, 0, match_options, match_data, match_context, + dfa_workspace, DFA_WORKSPACE_COUNT); + +#ifdef STANDALONE + if (errorcode >= 0) printf("Match returned %d\n", errorcode); else + { + unsigned char buffer[256]; + pcre2_get_error_message(errorcode, buffer, 256); + printf("Match failed: error %d: %s\n", errorcode, buffer); + } +#endif + + match_options = 0; /* For second time */ + } + + match_options = save_match_options; /* Reset for the second compile */ + pcre2_code_free(code); + } + + /* Compilation failed */ + + else + { + unsigned char buffer[256]; + pcre2_get_error_message(errorcode, buffer, 256); +#ifdef STANDALONE + printf("Error %d at offset %lu: %s\n", errorcode, erroroffset, buffer); +#else + if (strstr((const char *)buffer, "internal error") != NULL) abort(); /* Abort deliberately when the message text reports an internal error */ +#endif + } + + compile_options = PCRE2_NEVER_BACKSLASH_C; /* For second time */ + } + +if (match_data != NULL) pcre2_match_data_free(match_data); +if (match_context != NULL) pcre2_match_context_free(match_context); + +return 0; +} + + +/* Optional main program. Each argument is either a literal string or the name +of a file whose contents are passed to the driver. */ + +#ifdef STANDALONE +int main(int argc, char **argv) +{ +int i; + +if (argc < 2) + { + printf("** No arguments given\n"); + return 0; + } + +for (i = 1; i < argc; i++) + { + size_t filelen; + size_t readsize; + unsigned char *buffer; + FILE *f; + + /* Handle a literal string. Copy to an exact size buffer so that checks for + overrunning work. 
*/ + + if (argv[i][0] == '=') + { + readsize = strlen(argv[i]) - 1; /* Length of the text after the leading = character */ + printf("------ ------\n"); + printf("Length = %lu\n", readsize); + printf("%.*s\n", (int)readsize, argv[i]+1); + buffer = (unsigned char *)malloc(readsize); + if (buffer == NULL) + printf("** Failed to allocate %lu bytes of memory\n", readsize); + else + { + memcpy(buffer, argv[i]+1, readsize); + LLVMFuzzerTestOneInput(buffer, readsize); + free(buffer); + } + continue; + } + + /* Handle a string given in a file */ + + f = fopen(argv[i], "rb"); + if (f == NULL) + { + printf("** Failed to open %s: %s\n", argv[i], strerror(errno)); + continue; + } + + printf("------ %s ------\n", argv[i]); + + fseek(f, 0, SEEK_END); + filelen = ftell(f); /* NOTE(review): ftell() reports failure with a negative value, which is not checked before it is stored in a size_t */ + fseek(f, 0, SEEK_SET); + + buffer = (unsigned char *)malloc(filelen); + if (buffer == NULL) + { + printf("** Failed to allocate %lu bytes of memory\n", filelen); + fclose(f); + continue; + } + + readsize = fread(buffer, 1, filelen, f); + fclose(f); + + if (readsize != filelen) + printf("** File size is %lu but fread() returned %lu\n", filelen, readsize); + else + { + printf("Length = %lu\n", filelen); + LLVMFuzzerTestOneInput(buffer, filelen); + } + free(buffer); + } + +return 0; +} +#endif /* STANDALONE */ + +/* End */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_internal.h b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_internal.h new file mode 100644 index 0000000000..1acea56e4f --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_internal.h @@ -0,0 +1,2047 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE2 is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifndef PCRE2_INTERNAL_H_IDEMPOTENT_GUARD +#define PCRE2_INTERNAL_H_IDEMPOTENT_GUARD + +/* We do not support both EBCDIC and Unicode at the same time. 
The "configure" +script prevents both being selected, but not everybody uses "configure". EBCDIC +is only supported for the 8-bit library, but the check for this has to be later +in this file, because the first part is not width-dependent, and is included by +pcre2test.c with CODE_UNIT_WIDTH == 0. */ + +#if defined EBCDIC && defined SUPPORT_UNICODE +#error The use of both EBCDIC and SUPPORT_UNICODE is not supported. +#endif + +/* Standard C headers */ + +#include +#include +#include +#include +#include +#include + +/* Macros to make boolean values more obvious. The #ifndef is to pacify +compiler warnings in environments where these macros are defined elsewhere. +Unfortunately, there is no way to do the same for the typedef. */ + +typedef int BOOL; +#ifndef FALSE +#define FALSE 0 +#define TRUE 1 +#endif + +/* Valgrind (memcheck) support */ + +#ifdef SUPPORT_VALGRIND +#include +#endif + +/* The -ftrivial-auto-var-init compiler option supports initializing all local +variables to avoid some classes of bug, but this can cause an unacceptable slowdown +for large on-stack arrays in hot functions. This macro lets us annotate +such arrays. It expands to nothing unless HAVE_ATTRIBUTE_UNINITIALIZED is defined. */ + +#ifdef HAVE_ATTRIBUTE_UNINITIALIZED +#define PCRE2_KEEP_UNINITIALIZED __attribute__((uninitialized)) +#else +#define PCRE2_KEEP_UNINITIALIZED +#endif + +/* Older versions of MSVC lack snprintf(). This define allows for +warning/error-free compilation and testing with MSVC compilers back to at least +MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + +/* When compiling a DLL for Windows, the exported symbols have to be declared +using some MS magic. I found some useful information on this web page: +http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the +information there, using __declspec(dllexport) without "extern" we have a +definition; with "extern" we have a declaration. 
The settings here override the +setting in pcre2.h (which is included below); it defines only PCRE2_EXP_DECL, +which is all that is needed for applications (they just import the symbols). We +use: + + PCRE2_EXP_DECL for declarations + PCRE2_EXP_DEFN for definitions + +The reason for wrapping this in #ifndef PCRE2_EXP_DECL is so that pcre2test, +which is an application, but needs to import this file in order to "peek" at +internals, can #include pcre2.h first to get an application's-eye view. + +In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, +special-purpose environments) might want to stick other stuff in front of +exported symbols. That's why, in the non-Windows case, we set PCRE2_EXP_DEFN +only if it is not already set. */ + +#ifndef PCRE2_EXP_DECL +# ifdef _WIN32 +# ifndef PCRE2_STATIC +# define PCRE2_EXP_DECL extern __declspec(dllexport) +# define PCRE2_EXP_DEFN __declspec(dllexport) +# else +# define PCRE2_EXP_DECL extern +# define PCRE2_EXP_DEFN +# endif +# else +# ifdef __cplusplus +# define PCRE2_EXP_DECL extern "C" +# else +# define PCRE2_EXP_DECL extern +# endif +# ifndef PCRE2_EXP_DEFN +# define PCRE2_EXP_DEFN PCRE2_EXP_DECL +# endif +# endif +#endif + +/* Include the public PCRE2 header and the definitions of UCP character +property values. This must follow the setting of PCRE2_EXP_DECL above. */ + +#include "regexp/pcre2/pcre2.h" +#include "regexp/pcre2/pcre2_ucp.h" + +/* When PCRE2 is compiled as a C++ library, the subject pointer can be replaced +with a custom type. This makes it possible, for example, to allow pcre2_match() +to process subject strings that are discontinuous by using a smart pointer +class. It must always be possible to inspect all of the subject string in +pcre2_match() because of the way it backtracks. */ + +/* WARNING: This is as yet untested for PCRE2. 
*/ + +#ifdef CUSTOM_SUBJECT_PTR +#undef PCRE2_SPTR +#define PCRE2_SPTR CUSTOM_SUBJECT_PTR +#endif + +/* When checking for integer overflow in pcre2_compile(), we need to handle +large integers. If a 64-bit integer type is available, we can use that. +Otherwise we have to cast to double, which of course requires floating point +arithmetic. Handle this by defining a macro for the appropriate type. +NOTE(review): the "defined int64_t" test below is true only when int64_t is a +preprocessor macro; where it is a typedef the INT64_MAX test alone decides. */ + +#if defined INT64_MAX || defined int64_t +#define INT64_OR_DOUBLE int64_t +#else +#define INT64_OR_DOUBLE double +#endif + +/* External (in the C sense) functions and tables that are private to the +libraries are always referenced using the PRIV macro. This makes it possible +for pcre2test.c to include some of the source files from the libraries using a +different PRIV definition to avoid name clashes. It also makes it clear in the +code that a non-static object is being referenced. The default definition +below prefixes the name with _pcre2_. */ + +#ifndef PRIV +#define PRIV(name) _pcre2_##name +#endif + +/* When compiling for use with the Virtual Pascal compiler, these functions +need to have their names changed. PCRE2 must be compiled with the -DVPCOMPAT +option on the command line. */ + +#ifdef VPCOMPAT +#define strlen(s) _strlen(s) +#define strncmp(s1,s2,m) _strncmp(s1,s2,m) +#define memcmp(s,c,n) _memcmp(s,c,n) +#define memcpy(d,s,n) _memcpy(d,s,n) +#define memmove(d,s,n) _memmove(d,s,n) +#define memset(s,c,n) _memset(s,c,n) +#else /* VPCOMPAT */ + +/* Otherwise, to cope with SunOS4 and other systems that lack memmove(), define +a macro that calls an emulating function. */ + +#ifndef HAVE_MEMMOVE +#undef memmove /* Some systems may have a macro */ +#define memmove(a, b, c) PRIV(memmove)(a, b, c) +#endif /* not HAVE_MEMMOVE */ +#endif /* not VPCOMPAT */ + +/* This is an unsigned int value that no UTF character can ever have, as +Unicode doesn't go beyond 0x0010ffff. 
*/ + +#define MAX_UTF_CODE_POINT 0x10ffff + +/* Compile-time positive error numbers (all except UTF errors, which are +negative) start at this value. It should probably never be changed, in case +some application is checking for specific numbers. There is a copy of this +#define in pcre2posix.c (which now no longer includes this file). Ideally, a +way of having a single definition should be found, but as the number is +unlikely to change, this is not a pressing issue. The original reason for +having a base other than 0 was to keep the absolute values of compile-time and +run-time error numbers numerically different, but in the event the code does +not rely on this. */ + +#define COMPILE_ERROR_BASE 100 + +/* The initial frames vector for remembering pcre2_match() backtracking points +is allocated on the heap, of this size (bytes) or ten times the frame size if +larger, unless the heap limit is smaller. Typical frame sizes are a few hundred +bytes (it depends on the number of capturing parentheses) so 20KiB handles +quite a few frames. A larger vector on the heap is obtained for matches that +need more frames, subject to the heap limit. */ + +#define START_FRAMES_SIZE 20480 + +/* For DFA matching, an initial internal workspace vector is allocated on the +stack. The heap is used only if this turns out to be too small. */ + +#define DFA_START_RWS_SIZE 30720 + +/* Define the default BSR convention. */ + +#ifdef BSR_ANYCRLF +#define BSR_DEFAULT PCRE2_BSR_ANYCRLF +#else +#define BSR_DEFAULT PCRE2_BSR_UNICODE +#endif + + +/* ---------------- Basic UTF-8 macros ---------------- */ + +/* These UTF-8 macros are always defined because they are used in pcre2test for +handling wide characters in 16-bit and 32-bit modes, even if an 8-bit library +is not supported. */ + +/* Tests whether a UTF-8 code point needs extra bytes to decode. 
*/ + +#define HASUTF8EXTRALEN(c) ((c) >= 0xc0) + +/* The following macros were originally written in the form of loops that used +data from the tables whose names start with PRIV(utf8_table). They were +rewritten by a user so as not to use loops, because in some environments this +gives a significant performance advantage, and it seems never to do any harm. +*/ + +/* Base macro to pick up the remaining bytes of a UTF-8 character, not +advancing the pointer. On entry c holds the lead byte and eptr is not moved: +the continuation bytes are read from eptr[1] up to eptr[5]. The macro does not +check that c needs extra bytes (see HASUTF8EXTRALEN) and does not validate the +continuation bytes, which are simply masked with 0x3f. Both arguments are +expanded several times and are not parenthesized. */ + +#define GETUTF8(c, eptr) \ + { \ + if ((c & 0x20u) == 0) \ + c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ + else if ((c & 0x10u) == 0) \ + c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ + else if ((c & 0x08u) == 0) \ + c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ + ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ + else if ((c & 0x04u) == 0) \ + c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ + ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ + (eptr[4] & 0x3fu); \ + else \ + c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ + ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ + ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ + } + +/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing +the pointer. 
*/ + +/* In contrast to GETUTF8, the continuation bytes are read starting at *eptr, so +eptr must already point at the first byte after the lead byte; afterwards eptr +has been advanced past every byte consumed (1 to 5). */ + +#define GETUTF8INC(c, eptr) \ + { \ + if ((c & 0x20u) == 0) \ + c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \ + else if ((c & 0x10u) == 0) \ + { \ + c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \ + eptr += 2; \ + } \ + else if ((c & 0x08u) == 0) \ + { \ + c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \ + ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ + eptr += 3; \ + } \ + else if ((c & 0x04u) == 0) \ + { \ + c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \ + ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \ + (eptr[3] & 0x3fu); \ + eptr += 4; \ + } \ + else \ + { \ + c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \ + ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \ + ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \ + eptr += 5; \ + } \ + } + +/* Base macro to pick up the remaining bytes of a UTF-8 character, not +advancing the pointer, incrementing the length. As in GETUTF8 the bytes are +read from eptr[1] onwards; len is increased by the number of continuation bytes +(1 to 5), not by the full sequence length. */ + +#define GETUTF8LEN(c, eptr, len) \ + { \ + if ((c & 0x20u) == 0) \ + { \ + c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ + len++; \ + } \ + else if ((c & 0x10u) == 0) \ + { \ + c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ + len += 2; \ + } \ + else if ((c & 0x08u) == 0) \ + {\ + c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ + ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ + len += 3; \ + } \ + else if ((c & 0x04u) == 0) \ + { \ + c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ + ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ + (eptr[4] & 0x3fu); \ + len += 4; \ + } \ + else \ + {\ + c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ + ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ + ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ + len += 5; \ + } \ + } + +/* --------------- Whitespace macros ---------------- */ + +/* Tests for Unicode horizontal and vertical whitespace characters must check a +number of different values. 
Using a switch statement for this generates the +fastest code (no loop, no memory access), and there are several places in the +interpreter code where this happens. In order to ensure that all the case lists +remain in step, we use macros so that there is only one place where the lists +are defined. + +These values are also required as lists in pcre2_compile.c when processing \h, +\H, \v and \V in a character class. The lists are defined in pcre2_tables.c, +but macros that define the values are here so that all the definitions are +together. The lists must be in ascending character order, terminated by +NOTACHAR (which is 0xffffffff). + +Any changes should ensure that the various macros are kept in step with each +other. NOTE: The values also appear in pcre2_jit_compile.c. */ + +/* -------------- ASCII/Unicode environments -------------- */ + +#ifndef EBCDIC + +/* Character U+180E (Mongolian Vowel Separator) is not included in the list of +spaces in the Unicode file PropList.txt, and Perl does not recognize it as a +space. However, in many other sources it is listed as a space and has been in +PCRE (both APIs) for a long time. 
*/ + +/* The *_LIST macros expand to comma-separated values. The *_CASES macros +expand to a run of case labels whose final label deliberately has no trailing +colon, so that the place of use supplies it; HSPACE_CASES and VSPACE_CASES do +exactly that when they chain the byte cases onto the multibyte cases. */ + +#define HSPACE_LIST \ + CHAR_HT, CHAR_SPACE, CHAR_NBSP, \ + 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \ + 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \ + NOTACHAR + +#define HSPACE_MULTIBYTE_CASES \ + case 0x1680: /* OGHAM SPACE MARK */ \ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \ + case 0x2000: /* EN QUAD */ \ + case 0x2001: /* EM QUAD */ \ + case 0x2002: /* EN SPACE */ \ + case 0x2003: /* EM SPACE */ \ + case 0x2004: /* THREE-PER-EM SPACE */ \ + case 0x2005: /* FOUR-PER-EM SPACE */ \ + case 0x2006: /* SIX-PER-EM SPACE */ \ + case 0x2007: /* FIGURE SPACE */ \ + case 0x2008: /* PUNCTUATION SPACE */ \ + case 0x2009: /* THIN SPACE */ \ + case 0x200A: /* HAIR SPACE */ \ + case 0x202f: /* NARROW NO-BREAK SPACE */ \ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \ + case 0x3000 /* IDEOGRAPHIC SPACE */ + +#define HSPACE_BYTE_CASES \ + case CHAR_HT: \ + case CHAR_SPACE: \ + case CHAR_NBSP + +#define HSPACE_CASES \ + HSPACE_BYTE_CASES: \ + HSPACE_MULTIBYTE_CASES + +#define VSPACE_LIST \ + CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR + +#define VSPACE_MULTIBYTE_CASES \ + case 0x2028: /* LINE SEPARATOR */ \ + case 0x2029 /* PARAGRAPH SEPARATOR */ + +#define VSPACE_BYTE_CASES \ + case CHAR_LF: \ + case CHAR_VT: \ + case CHAR_FF: \ + case CHAR_CR: \ + case CHAR_NEL + +#define VSPACE_CASES \ + VSPACE_BYTE_CASES: \ + VSPACE_MULTIBYTE_CASES + +/* -------------- EBCDIC environments -------------- */ + +#else +/* No multibyte cases are defined in this branch, so HSPACE_CASES and +VSPACE_CASES are just the byte cases. The position of CHAR_LF and CHAR_NEL in +VSPACE_LIST depends on EBCDIC_NL25, presumably to keep the list in ascending +order (confirm against the CHAR_NL/CHAR_NEL values). */ +#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR + +#define HSPACE_BYTE_CASES \ + case CHAR_HT: \ + case CHAR_SPACE: \ + case CHAR_NBSP + +#define HSPACE_CASES HSPACE_BYTE_CASES + +#ifdef EBCDIC_NL25 +#define VSPACE_LIST \ + CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR +#else +#define VSPACE_LIST \ + CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR +#endif + +#define VSPACE_BYTE_CASES \ + case CHAR_LF: \ + case CHAR_VT: \ + case CHAR_FF: \ + case CHAR_CR: \ + case 
CHAR_NEL + +#define VSPACE_CASES VSPACE_BYTE_CASES +#endif /* EBCDIC */ + +/* -------------- End of whitespace macros -------------- */ + + +/* PCRE2 is able to support several different kinds of newline (CR, LF, CRLF, +"any" and "anycrlf" at present). The following macros are used to package up +testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various +modules to indicate in which datablock the parameters exist, and what the +start/end of string field names are. */ + +#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ +#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ +#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */ + +/* This macro checks for a newline at the given position. It expands to an +expression that needs NLBLOCK, PSEND and a variable named utf at the place of +use. For NLTYPE_FIXED it first checks that nllen code units remain before +PSEND, then compares the unit at p with nl[0] and, unless nllen is 1, the next +unit with nl[1]. For the other types it calls PRIV(is_newline), handing it +&(NLBLOCK->nllen), presumably so the callee can store the length it found +(confirm against that function). p is expanded more than once. */ + +#define IS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED)? \ + ((p) < NLBLOCK->PSEND && \ + PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \ + &(NLBLOCK->nllen), utf)) \ + : \ + ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ + UCHAR21TEST(p) == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || UCHAR21TEST(p+1) == NLBLOCK->nl[1]) \ + ) \ + ) + +/* This macro checks for a newline immediately preceding the given position. It +mirrors IS_NEWLINE: the bound is PSSTART instead of PSEND, the fixed-length +comparison starts at p - nllen, and the other types call PRIV(was_newline). */ + +#define WAS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED)? \ + ((p) > NLBLOCK->PSSTART && \ + PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ + &(NLBLOCK->nllen), utf)) \ + : \ + ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ + UCHAR21TEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || UCHAR21TEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \ + ) \ + ) + +/* Private flags containing information about the compiled pattern. The first +three must not be changed, because whichever is set is actually the number of +bytes in a code unit in that mode. 
*/ + +#define PCRE2_MODE8 0x00000001 /* compiled in 8 bit mode */ +#define PCRE2_MODE16 0x00000002 /* compiled in 16 bit mode */ +#define PCRE2_MODE32 0x00000004 /* compiled in 32 bit mode */ +#define PCRE2_FIRSTSET 0x00000010 /* first_code unit is set */ +#define PCRE2_FIRSTCASELESS 0x00000020 /* caseless first code unit */ +#define PCRE2_FIRSTMAPSET 0x00000040 /* bitmap of first code units is set */ +#define PCRE2_LASTSET 0x00000080 /* last code unit is set */ +#define PCRE2_LASTCASELESS 0x00000100 /* caseless last code unit */ +#define PCRE2_STARTLINE 0x00000200 /* start after \n for multiline */ +#define PCRE2_JCHANGED 0x00000400 /* j option used in pattern */ +#define PCRE2_HASCRORLF 0x00000800 /* explicit \r or \n in pattern */ +#define PCRE2_HASTHEN 0x00001000 /* pattern contains (*THEN) */ +#define PCRE2_MATCH_EMPTY 0x00002000 /* pattern can match empty string */ +#define PCRE2_BSR_SET 0x00004000 /* BSR was set in the pattern */ +#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ +#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used (keep together with next) */ +#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used (keep together with previous) */ +#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */ +#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */ +#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */ +#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */ +#define PCRE2_HASBKC 0x00400000 /* contains \C */ +#define PCRE2_HASACCEPT 0x00800000 /* contains (*ACCEPT) */ + +/* Mask covering the three mode bits. Bit 0x00000008 is not assigned to any of +the flags above. */ + +#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) + +/* Values for the matchedby field in a match data block. The enum is unnamed and +has no explicit initializers, so the values are 0, 1 and 2 in the order listed. */ + +enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */ + PCRE2_MATCHEDBY_DFA_INTERPRETER, /* pcre2_dfa_match() */ + PCRE2_MATCHEDBY_JIT }; /* pcre2_jit_match() */ + +/* Values for the flags field in a match data block. */ + +#define PCRE2_MD_COPIED_SUBJECT 0x01u + +/* Magic number to provide a small check against being handed junk. 
*/ + +#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' in ASCII (0x50 0x43 0x52 0x45) */ + +/* The maximum remaining length of subject we are prepared to search for a +req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is +much faster than the search loop that has to be used in 16-bit and 32-bit +modes. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define REQ_CU_MAX 5000 +#else +#define REQ_CU_MAX 2000 +#endif + +/* Offsets for the bitmap tables in the cbits set of tables. Each table +contains a set of bits for a class map. Some classes are built by combining +these tables. The offsets are 32 apart, and cbit_length is 10 * 32. */ + +#define cbit_space 0 /* [:space:] or \s */ +#define cbit_xdigit 32 /* [:xdigit:] */ +#define cbit_digit 64 /* [:digit:] or \d */ +#define cbit_upper 96 /* [:upper:] */ +#define cbit_lower 128 /* [:lower:] */ +#define cbit_word 160 /* [:word:] or \w */ +#define cbit_graph 192 /* [:graph:] */ +#define cbit_print 224 /* [:print:] */ +#define cbit_punct 256 /* [:punct:] */ +#define cbit_cntrl 288 /* [:cntrl:] */ +#define cbit_length 320 /* Length of the cbits table */ + +/* Bit definitions for entries in the ctypes table. Do not change these values +without checking pcre2_jit_compile.c, which has an assertion to ensure that +ctype_word has the value 16. */ + +#define ctype_space 0x01 +#define ctype_letter 0x02 +#define ctype_lcletter 0x04 +#define ctype_digit 0x08 +#define ctype_word 0x10 /* alphanumeric or '_' */ + +/* Offsets of the various tables from the base tables pointer, and +total length of the tables. 
*/ + +#define lcc_offset 0 /* Lower case */ +#define fcc_offset 256 /* Flip case */ +#define cbits_offset 512 /* Character classes */ +#define ctypes_offset (cbits_offset + cbit_length) /* Character types */ +#define TABLES_LENGTH (ctypes_offset + 256) /* 512 + 320 + 256 = 1088 */ + + +/* -------------------- Character and string names ------------------------ */ + +/* If PCRE2 is to support UTF-8 on EBCDIC platforms, we cannot use normal +character constants like '*' because the compiler would emit their EBCDIC code, +which is different from their ASCII/UTF-8 code. Instead we define macros for +the characters so that they always use the ASCII/UTF-8 code when UTF-8 support +is enabled. When UTF-8 support is not enabled, the definitions use character +literals. Both character and string versions of each character are needed, and +there are some longer strings as well. + +This means that, on EBCDIC platforms, the PCRE2 library can handle either +EBCDIC, or UTF-8, but not both. To support both in the same compiled library +would need different lookups depending on whether PCRE2_UTF was set or not. +This would make it impossible to use characters in switch/case statements, +which would reduce performance. For a theoretical use (which nobody has asked +for) in a minority area (EBCDIC platforms), this is not sensible. Any +application that did need both could compile two versions of the library, using +macros to give the functions distinct names. */ + +#ifndef SUPPORT_UNICODE + +/* UTF-8 support is not enabled; use the platform-dependent character literals +so that PCRE2 works in both ASCII and EBCDIC environments, but only in non-UTF +mode. Newline characters are problematic in EBCDIC. Though it has CR and LF +characters, a common practice has been to use its NL (0x15) character as the +line terminator in C-like processing environments. 
However, sometimes the LF +(0x25) character is used instead, according to this Unicode document: + +http://unicode.org/standard/reports/tr13/tr13-5.html + +PCRE2 defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 +instead. Whichever is *not* chosen is defined as NEL. + +In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the +same code point. */ + +#ifdef EBCDIC + +#ifndef EBCDIC_NL25 +#define CHAR_NL '\x15' +#define CHAR_NEL '\x25' +#define STR_NL "\x15" +#define STR_NEL "\x25" +#else +#define CHAR_NL '\x25' +#define CHAR_NEL '\x15' +#define STR_NL "\x25" +#define STR_NEL "\x15" +#endif + +#define CHAR_LF CHAR_NL +#define STR_LF STR_NL + +#define CHAR_ESC '\047' +#define CHAR_DEL '\007' +#define CHAR_NBSP ((unsigned char)'\x41') +#define STR_ESC "\047" +#define STR_DEL "\007" + +#else /* Not EBCDIC */ + +/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for +compatibility. NEL is the Unicode newline character; make sure it is +a positive value. */ + +#define CHAR_LF '\n' +#define CHAR_NL CHAR_LF +#define CHAR_NEL ((unsigned char)'\x85') +#define CHAR_ESC '\033' +#define CHAR_DEL '\177' +#define CHAR_NBSP ((unsigned char)'\xa0') + +#define STR_LF "\n" +#define STR_NL STR_LF +#define STR_NEL "\x85" +#define STR_ESC "\033" +#define STR_DEL "\177" + +#endif /* EBCDIC */ + +/* The remaining definitions work in both environments. */ + +#define CHAR_NUL '\0' +#define CHAR_HT '\t' +#define CHAR_VT '\v' +#define CHAR_FF '\f' +#define CHAR_CR '\r' +#define CHAR_BS '\b' +#define CHAR_BEL '\a' + +#define CHAR_SPACE ' ' +#define CHAR_EXCLAMATION_MARK '!' +#define CHAR_QUOTATION_MARK '"' +#define CHAR_NUMBER_SIGN '#' +#define CHAR_DOLLAR_SIGN '$' +#define CHAR_PERCENT_SIGN '%' +#define CHAR_AMPERSAND '&' +#define CHAR_APOSTROPHE '\'' +#define CHAR_LEFT_PARENTHESIS '(' +#define CHAR_RIGHT_PARENTHESIS ')' +#define CHAR_ASTERISK '*' +#define CHAR_PLUS '+' +#define CHAR_COMMA ',' +#define CHAR_MINUS '-' +#define CHAR_DOT '.' 
+#define CHAR_SLASH '/' +#define CHAR_0 '0' +#define CHAR_1 '1' +#define CHAR_2 '2' +#define CHAR_3 '3' +#define CHAR_4 '4' +#define CHAR_5 '5' +#define CHAR_6 '6' +#define CHAR_7 '7' +#define CHAR_8 '8' +#define CHAR_9 '9' +#define CHAR_COLON ':' +#define CHAR_SEMICOLON ';' +#define CHAR_LESS_THAN_SIGN '<' +#define CHAR_EQUALS_SIGN '=' +#define CHAR_GREATER_THAN_SIGN '>' +#define CHAR_QUESTION_MARK '?' +#define CHAR_COMMERCIAL_AT '@' +#define CHAR_A 'A' +#define CHAR_B 'B' +#define CHAR_C 'C' +#define CHAR_D 'D' +#define CHAR_E 'E' +#define CHAR_F 'F' +#define CHAR_G 'G' +#define CHAR_H 'H' +#define CHAR_I 'I' +#define CHAR_J 'J' +#define CHAR_K 'K' +#define CHAR_L 'L' +#define CHAR_M 'M' +#define CHAR_N 'N' +#define CHAR_O 'O' +#define CHAR_P 'P' +#define CHAR_Q 'Q' +#define CHAR_R 'R' +#define CHAR_S 'S' +#define CHAR_T 'T' +#define CHAR_U 'U' +#define CHAR_V 'V' +#define CHAR_W 'W' +#define CHAR_X 'X' +#define CHAR_Y 'Y' +#define CHAR_Z 'Z' +#define CHAR_LEFT_SQUARE_BRACKET '[' +#define CHAR_BACKSLASH '\\' +#define CHAR_RIGHT_SQUARE_BRACKET ']' +#define CHAR_CIRCUMFLEX_ACCENT '^' +#define CHAR_UNDERSCORE '_' +#define CHAR_GRAVE_ACCENT '`' +#define CHAR_a 'a' +#define CHAR_b 'b' +#define CHAR_c 'c' +#define CHAR_d 'd' +#define CHAR_e 'e' +#define CHAR_f 'f' +#define CHAR_g 'g' +#define CHAR_h 'h' +#define CHAR_i 'i' +#define CHAR_j 'j' +#define CHAR_k 'k' +#define CHAR_l 'l' +#define CHAR_m 'm' +#define CHAR_n 'n' +#define CHAR_o 'o' +#define CHAR_p 'p' +#define CHAR_q 'q' +#define CHAR_r 'r' +#define CHAR_s 's' +#define CHAR_t 't' +#define CHAR_u 'u' +#define CHAR_v 'v' +#define CHAR_w 'w' +#define CHAR_x 'x' +#define CHAR_y 'y' +#define CHAR_z 'z' +#define CHAR_LEFT_CURLY_BRACKET '{' +#define CHAR_VERTICAL_LINE '|' +#define CHAR_RIGHT_CURLY_BRACKET '}' +#define CHAR_TILDE '~' + +#define STR_HT "\t" +#define STR_VT "\v" +#define STR_FF "\f" +#define STR_CR "\r" +#define STR_BS "\b" +#define STR_BEL "\a" + +#define STR_SPACE " " +#define STR_EXCLAMATION_MARK "!" 
+#define STR_QUOTATION_MARK "\"" +#define STR_NUMBER_SIGN "#" +#define STR_DOLLAR_SIGN "$" +#define STR_PERCENT_SIGN "%" +#define STR_AMPERSAND "&" +#define STR_APOSTROPHE "'" +#define STR_LEFT_PARENTHESIS "(" +#define STR_RIGHT_PARENTHESIS ")" +#define STR_ASTERISK "*" +#define STR_PLUS "+" +#define STR_COMMA "," +#define STR_MINUS "-" +#define STR_DOT "." +#define STR_SLASH "/" +#define STR_0 "0" +#define STR_1 "1" +#define STR_2 "2" +#define STR_3 "3" +#define STR_4 "4" +#define STR_5 "5" +#define STR_6 "6" +#define STR_7 "7" +#define STR_8 "8" +#define STR_9 "9" +#define STR_COLON ":" +#define STR_SEMICOLON ";" +#define STR_LESS_THAN_SIGN "<" +#define STR_EQUALS_SIGN "=" +#define STR_GREATER_THAN_SIGN ">" +#define STR_QUESTION_MARK "?" +#define STR_COMMERCIAL_AT "@" +#define STR_A "A" +#define STR_B "B" +#define STR_C "C" +#define STR_D "D" +#define STR_E "E" +#define STR_F "F" +#define STR_G "G" +#define STR_H "H" +#define STR_I "I" +#define STR_J "J" +#define STR_K "K" +#define STR_L "L" +#define STR_M "M" +#define STR_N "N" +#define STR_O "O" +#define STR_P "P" +#define STR_Q "Q" +#define STR_R "R" +#define STR_S "S" +#define STR_T "T" +#define STR_U "U" +#define STR_V "V" +#define STR_W "W" +#define STR_X "X" +#define STR_Y "Y" +#define STR_Z "Z" +#define STR_LEFT_SQUARE_BRACKET "[" +#define STR_BACKSLASH "\\" +#define STR_RIGHT_SQUARE_BRACKET "]" +#define STR_CIRCUMFLEX_ACCENT "^" +#define STR_UNDERSCORE "_" +#define STR_GRAVE_ACCENT "`" +#define STR_a "a" +#define STR_b "b" +#define STR_c "c" +#define STR_d "d" +#define STR_e "e" +#define STR_f "f" +#define STR_g "g" +#define STR_h "h" +#define STR_i "i" +#define STR_j "j" +#define STR_k "k" +#define STR_l "l" +#define STR_m "m" +#define STR_n "n" +#define STR_o "o" +#define STR_p "p" +#define STR_q "q" +#define STR_r "r" +#define STR_s "s" +#define STR_t "t" +#define STR_u "u" +#define STR_v "v" +#define STR_w "w" +#define STR_x "x" +#define STR_y "y" +#define STR_z "z" +#define STR_LEFT_CURLY_BRACKET 
"{" +#define STR_VERTICAL_LINE "|" +#define STR_RIGHT_CURLY_BRACKET "}" +#define STR_TILDE "~" + +#define STRING_ACCEPT0 "ACCEPT\0" +#define STRING_COMMIT0 "COMMIT\0" +#define STRING_F0 "F\0" +#define STRING_FAIL0 "FAIL\0" +#define STRING_MARK0 "MARK\0" +#define STRING_PRUNE0 "PRUNE\0" +#define STRING_SKIP0 "SKIP\0" +#define STRING_THEN "THEN" + +#define STRING_atomic0 "atomic\0" +#define STRING_pla0 "pla\0" +#define STRING_plb0 "plb\0" +#define STRING_napla0 "napla\0" +#define STRING_naplb0 "naplb\0" +#define STRING_nla0 "nla\0" +#define STRING_nlb0 "nlb\0" +#define STRING_sr0 "sr\0" +#define STRING_asr0 "asr\0" +#define STRING_positive_lookahead0 "positive_lookahead\0" +#define STRING_positive_lookbehind0 "positive_lookbehind\0" +#define STRING_non_atomic_positive_lookahead0 "non_atomic_positive_lookahead\0" +#define STRING_non_atomic_positive_lookbehind0 "non_atomic_positive_lookbehind\0" +#define STRING_negative_lookahead0 "negative_lookahead\0" +#define STRING_negative_lookbehind0 "negative_lookbehind\0" +#define STRING_script_run0 "script_run\0" +#define STRING_atomic_script_run "atomic_script_run" + +#define STRING_alpha0 "alpha\0" +#define STRING_lower0 "lower\0" +#define STRING_upper0 "upper\0" +#define STRING_alnum0 "alnum\0" +#define STRING_ascii0 "ascii\0" +#define STRING_blank0 "blank\0" +#define STRING_cntrl0 "cntrl\0" +#define STRING_digit0 "digit\0" +#define STRING_graph0 "graph\0" +#define STRING_print0 "print\0" +#define STRING_punct0 "punct\0" +#define STRING_space0 "space\0" +#define STRING_word0 "word\0" +#define STRING_xdigit "xdigit" + +#define STRING_DEFINE "DEFINE" +#define STRING_VERSION "VERSION" +#define STRING_WEIRD_STARTWORD "[:<:]]" +#define STRING_WEIRD_ENDWORD "[:>:]]" + +#define STRING_CR_RIGHTPAR "CR)" +#define STRING_LF_RIGHTPAR "LF)" +#define STRING_CRLF_RIGHTPAR "CRLF)" +#define STRING_ANY_RIGHTPAR "ANY)" +#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" +#define STRING_NUL_RIGHTPAR "NUL)" +#define STRING_BSR_ANYCRLF_RIGHTPAR 
"BSR_ANYCRLF)" +#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" +#define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_UTF16_RIGHTPAR "UTF16)" +#define STRING_UTF32_RIGHTPAR "UTF32)" +#define STRING_UTF_RIGHTPAR "UTF)" +#define STRING_UCP_RIGHTPAR "UCP)" +#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" +#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR "NO_DOTSTAR_ANCHOR)" +#define STRING_NO_JIT_RIGHTPAR "NO_JIT)" +#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" +#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" +#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" +#define STRING_LIMIT_HEAP_EQ "LIMIT_HEAP=" +#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" +#define STRING_LIMIT_DEPTH_EQ "LIMIT_DEPTH=" +#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" +#define STRING_MARK "MARK" + +#define STRING_bc "bc" +#define STRING_bidiclass "bidiclass" +#define STRING_sc "sc" +#define STRING_script "script" +#define STRING_scriptextensions "scriptextensions" +#define STRING_scx "scx" + +#else /* SUPPORT_UNICODE */ + +/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This +works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode +only. 
*/ + +#define CHAR_HT '\011' +#define CHAR_VT '\013' +#define CHAR_FF '\014' +#define CHAR_CR '\015' +#define CHAR_LF '\012' +#define CHAR_NL CHAR_LF +#define CHAR_NEL ((unsigned char)'\x85') +#define CHAR_BS '\010' +#define CHAR_BEL '\007' +#define CHAR_ESC '\033' +#define CHAR_DEL '\177' + +#define CHAR_NUL '\0' +#define CHAR_SPACE '\040' +#define CHAR_EXCLAMATION_MARK '\041' +#define CHAR_QUOTATION_MARK '\042' +#define CHAR_NUMBER_SIGN '\043' +#define CHAR_DOLLAR_SIGN '\044' +#define CHAR_PERCENT_SIGN '\045' +#define CHAR_AMPERSAND '\046' +#define CHAR_APOSTROPHE '\047' +#define CHAR_LEFT_PARENTHESIS '\050' +#define CHAR_RIGHT_PARENTHESIS '\051' +#define CHAR_ASTERISK '\052' +#define CHAR_PLUS '\053' +#define CHAR_COMMA '\054' +#define CHAR_MINUS '\055' +#define CHAR_DOT '\056' +#define CHAR_SLASH '\057' +#define CHAR_0 '\060' +#define CHAR_1 '\061' +#define CHAR_2 '\062' +#define CHAR_3 '\063' +#define CHAR_4 '\064' +#define CHAR_5 '\065' +#define CHAR_6 '\066' +#define CHAR_7 '\067' +#define CHAR_8 '\070' +#define CHAR_9 '\071' +#define CHAR_COLON '\072' +#define CHAR_SEMICOLON '\073' +#define CHAR_LESS_THAN_SIGN '\074' +#define CHAR_EQUALS_SIGN '\075' +#define CHAR_GREATER_THAN_SIGN '\076' +#define CHAR_QUESTION_MARK '\077' +#define CHAR_COMMERCIAL_AT '\100' +#define CHAR_A '\101' +#define CHAR_B '\102' +#define CHAR_C '\103' +#define CHAR_D '\104' +#define CHAR_E '\105' +#define CHAR_F '\106' +#define CHAR_G '\107' +#define CHAR_H '\110' +#define CHAR_I '\111' +#define CHAR_J '\112' +#define CHAR_K '\113' +#define CHAR_L '\114' +#define CHAR_M '\115' +#define CHAR_N '\116' +#define CHAR_O '\117' +#define CHAR_P '\120' +#define CHAR_Q '\121' +#define CHAR_R '\122' +#define CHAR_S '\123' +#define CHAR_T '\124' +#define CHAR_U '\125' +#define CHAR_V '\126' +#define CHAR_W '\127' +#define CHAR_X '\130' +#define CHAR_Y '\131' +#define CHAR_Z '\132' +#define CHAR_LEFT_SQUARE_BRACKET '\133' +#define CHAR_BACKSLASH '\134' +#define CHAR_RIGHT_SQUARE_BRACKET '\135' 
+#define CHAR_CIRCUMFLEX_ACCENT '\136' +#define CHAR_UNDERSCORE '\137' +#define CHAR_GRAVE_ACCENT '\140' +#define CHAR_a '\141' +#define CHAR_b '\142' +#define CHAR_c '\143' +#define CHAR_d '\144' +#define CHAR_e '\145' +#define CHAR_f '\146' +#define CHAR_g '\147' +#define CHAR_h '\150' +#define CHAR_i '\151' +#define CHAR_j '\152' +#define CHAR_k '\153' +#define CHAR_l '\154' +#define CHAR_m '\155' +#define CHAR_n '\156' +#define CHAR_o '\157' +#define CHAR_p '\160' +#define CHAR_q '\161' +#define CHAR_r '\162' +#define CHAR_s '\163' +#define CHAR_t '\164' +#define CHAR_u '\165' +#define CHAR_v '\166' +#define CHAR_w '\167' +#define CHAR_x '\170' +#define CHAR_y '\171' +#define CHAR_z '\172' +#define CHAR_LEFT_CURLY_BRACKET '\173' +#define CHAR_VERTICAL_LINE '\174' +#define CHAR_RIGHT_CURLY_BRACKET '\175' +#define CHAR_TILDE '\176' +#define CHAR_NBSP ((unsigned char)'\xa0') + +#define STR_HT "\011" +#define STR_VT "\013" +#define STR_FF "\014" +#define STR_CR "\015" +#define STR_NL "\012" +#define STR_BS "\010" +#define STR_BEL "\007" +#define STR_ESC "\033" +#define STR_DEL "\177" + +#define STR_SPACE "\040" +#define STR_EXCLAMATION_MARK "\041" +#define STR_QUOTATION_MARK "\042" +#define STR_NUMBER_SIGN "\043" +#define STR_DOLLAR_SIGN "\044" +#define STR_PERCENT_SIGN "\045" +#define STR_AMPERSAND "\046" +#define STR_APOSTROPHE "\047" +#define STR_LEFT_PARENTHESIS "\050" +#define STR_RIGHT_PARENTHESIS "\051" +#define STR_ASTERISK "\052" +#define STR_PLUS "\053" +#define STR_COMMA "\054" +#define STR_MINUS "\055" +#define STR_DOT "\056" +#define STR_SLASH "\057" +#define STR_0 "\060" +#define STR_1 "\061" +#define STR_2 "\062" +#define STR_3 "\063" +#define STR_4 "\064" +#define STR_5 "\065" +#define STR_6 "\066" +#define STR_7 "\067" +#define STR_8 "\070" +#define STR_9 "\071" +#define STR_COLON "\072" +#define STR_SEMICOLON "\073" +#define STR_LESS_THAN_SIGN "\074" +#define STR_EQUALS_SIGN "\075" +#define STR_GREATER_THAN_SIGN "\076" +#define STR_QUESTION_MARK 
"\077" +#define STR_COMMERCIAL_AT "\100" +#define STR_A "\101" +#define STR_B "\102" +#define STR_C "\103" +#define STR_D "\104" +#define STR_E "\105" +#define STR_F "\106" +#define STR_G "\107" +#define STR_H "\110" +#define STR_I "\111" +#define STR_J "\112" +#define STR_K "\113" +#define STR_L "\114" +#define STR_M "\115" +#define STR_N "\116" +#define STR_O "\117" +#define STR_P "\120" +#define STR_Q "\121" +#define STR_R "\122" +#define STR_S "\123" +#define STR_T "\124" +#define STR_U "\125" +#define STR_V "\126" +#define STR_W "\127" +#define STR_X "\130" +#define STR_Y "\131" +#define STR_Z "\132" +#define STR_LEFT_SQUARE_BRACKET "\133" +#define STR_BACKSLASH "\134" +#define STR_RIGHT_SQUARE_BRACKET "\135" +#define STR_CIRCUMFLEX_ACCENT "\136" +#define STR_UNDERSCORE "\137" +#define STR_GRAVE_ACCENT "\140" +#define STR_a "\141" +#define STR_b "\142" +#define STR_c "\143" +#define STR_d "\144" +#define STR_e "\145" +#define STR_f "\146" +#define STR_g "\147" +#define STR_h "\150" +#define STR_i "\151" +#define STR_j "\152" +#define STR_k "\153" +#define STR_l "\154" +#define STR_m "\155" +#define STR_n "\156" +#define STR_o "\157" +#define STR_p "\160" +#define STR_q "\161" +#define STR_r "\162" +#define STR_s "\163" +#define STR_t "\164" +#define STR_u "\165" +#define STR_v "\166" +#define STR_w "\167" +#define STR_x "\170" +#define STR_y "\171" +#define STR_z "\172" +#define STR_LEFT_CURLY_BRACKET "\173" +#define STR_VERTICAL_LINE "\174" +#define STR_RIGHT_CURLY_BRACKET "\175" +#define STR_TILDE "\176" + +#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0" +#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" +#define STRING_F0 STR_F "\0" +#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" +#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0" +#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" +#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" +#define STRING_THEN STR_T STR_H STR_E STR_N + +#define STRING_atomic0 STR_a STR_t STR_o 
STR_m STR_i STR_c "\0" +#define STRING_pla0 STR_p STR_l STR_a "\0" +#define STRING_plb0 STR_p STR_l STR_b "\0" +#define STRING_napla0 STR_n STR_a STR_p STR_l STR_a "\0" +#define STRING_naplb0 STR_n STR_a STR_p STR_l STR_b "\0" +#define STRING_nla0 STR_n STR_l STR_a "\0" +#define STRING_nlb0 STR_n STR_l STR_b "\0" +#define STRING_sr0 STR_s STR_r "\0" +#define STRING_asr0 STR_a STR_s STR_r "\0" +#define STRING_positive_lookahead0 STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" +#define STRING_positive_lookbehind0 STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" +#define STRING_non_atomic_positive_lookahead0 STR_n STR_o STR_n STR_UNDERSCORE STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" +#define STRING_non_atomic_positive_lookbehind0 STR_n STR_o STR_n STR_UNDERSCORE STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" +#define STRING_negative_lookahead0 STR_n STR_e STR_g STR_a STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" +#define STRING_negative_lookbehind0 STR_n STR_e STR_g STR_a STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" +#define STRING_script_run0 STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n "\0" +#define STRING_atomic_script_run STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n + +#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" +#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0" +#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0" +#define 
STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0" +#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0" +#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0" +#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0" +#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0" +#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0" +#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0" +#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0" +#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0" +#define STRING_word0 STR_w STR_o STR_r STR_d "\0" +#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t + +#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E +#define STRING_VERSION STR_V STR_E STR_R STR_S STR_I STR_O STR_N +#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET +#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET + +#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS +#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS +#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_NUL_RIGHTPAR STR_N STR_U STR_L STR_RIGHT_PARENTHESIS +#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS +#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS +#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS +#define 
STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS +#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS +#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS +#define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_D STR_O STR_T STR_S STR_T STR_A STR_R STR_UNDERSCORE STR_A STR_N STR_C STR_H STR_O STR_R STR_RIGHT_PARENTHESIS +#define STRING_NO_JIT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_J STR_I STR_T STR_RIGHT_PARENTHESIS +#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS +#define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS +#define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS +#define STRING_LIMIT_HEAP_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_H STR_E STR_A STR_P STR_EQUALS_SIGN +#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN +#define STRING_LIMIT_DEPTH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_D STR_E STR_P STR_T STR_H STR_EQUALS_SIGN +#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN +#define STRING_MARK STR_M STR_A STR_R STR_K + +#define STRING_bc STR_b STR_c +#define STRING_bidiclass STR_b STR_i STR_d STR_i STR_c STR_l STR_a STR_s STR_s +#define STRING_sc STR_s STR_c +#define STRING_script STR_s STR_c STR_r STR_i STR_p STR_t +#define STRING_scriptextensions STR_s STR_c STR_r STR_i STR_p STR_t STR_e STR_x STR_t STR_e STR_n STR_s STR_i STR_o STR_n STR_s +#define STRING_scx STR_s STR_c STR_x + + +#endif /* SUPPORT_UNICODE */ + +/* -------------------- End of 
character and string names -------------------*/ + +/* -------------------- Definitions for compiled patterns -------------------*/ + +/* Codes for different types of Unicode property. If these definitions are +changed, the autopossessifying table in pcre2_auto_possess.c must be updated to +match. */ + +#define PT_ANY 0 /* Any property - matches all chars */ +#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ +#define PT_GC 2 /* Specified general characteristic (e.g. L) */ +#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ +#define PT_SC 4 /* Script only (e.g. Han) */ +#define PT_SCX 5 /* Script extensions (includes SC) */ +#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */ +#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */ +#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */ +#define PT_WORD 9 /* Word - L plus N plus underscore */ +#define PT_CLIST 10 /* Pseudo-property: match character list */ +#define PT_UCNC 11 /* Universal Character nameable character */ +#define PT_BIDICL 12 /* Specified bidi class */ +#define PT_BOOL 13 /* Boolean property */ +#define PT_TABSIZE 14 /* Size of square table for autopossessify tests */ + +/* The following special properties are used only in XCLASS items, when POSIX +classes are specified and PCRE2_UCP is set - in other words, for Unicode +handling of these classes. They are not available via the \p or \P escapes like +those in the above list, and so they do not take part in the autopossessifying +table. */ + +#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */ +#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */ +#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */ + +/* This value is used when parsing \p and \P escapes to indicate that neither +\p{script:...} nor \p{scx:...} has been encountered. 
*/ + +#define PT_NOTSCRIPT 255 + +/* Flag bits and data types for the extended class (OP_XCLASS) for classes that +contain characters with values greater than 255. */ + +#define XCL_NOT 0x01 /* Flag: this is a negative class */ +#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ +#define XCL_HASPROP 0x04 /* Flag: property checks are present. */ + +#define XCL_END 0 /* Marks end of individual items */ +#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ +#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ +#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ +#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ + +/* These are escaped items that aren't just an encoding of a particular data +value such as \n. They must have non-zero values, as check_escape() returns 0 +for a data character. In the escapes[] table in pcre2_compile.c their values +are negated in order to distinguish them from data values. + +They must appear here in the same order as in the opcode definitions below, up +to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL +mode rather than an escape sequence. It is also used for [^] in JavaScript +compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves +like \N. + +Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in +check_escape(). There are tests in the code for an escape greater than ESC_b +and less than ESC_Z to detect the types that may be repeated. These are the +types that consume characters. If any new escapes are put in between that don't +consume a character, that code will have to change. 
*/ + +enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, + ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, + ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, + ESC_E, ESC_Q, ESC_g, ESC_k }; + + +/********************** Opcode definitions ******************/ + +/****** NOTE NOTE NOTE ****** + +Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in +order to the list of escapes immediately above. Furthermore, values up to +OP_DOLLM must not be changed without adjusting the table called autoposstab in +pcre2_auto_possess.c. + +Whenever this list is updated, the two macro definitions that follow must be +updated to match. The possessification table called "opcode_possessify" in +pcre2_compile.c must also be updated, and also the tables called "coptable" +and "poptable" in pcre2_dfa_match.c. + +****** NOTE NOTE NOTE ******/ + + +/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive, +are used in a table for deciding whether a repeated character type can be +auto-possessified. 
*/ + +#define FIRST_AUTOTAB_OP OP_NOT_DIGIT +#define LAST_AUTOTAB_LEFT_OP OP_EXTUNI +#define LAST_AUTOTAB_RIGHT_OP OP_DOLLM + +enum { + OP_END, /* 0 End of pattern */ + + /* Values corresponding to backslashed metacharacters */ + + OP_SOD, /* 1 Start of data: \A */ + OP_SOM, /* 2 Start of match (subject + offset): \G */ + OP_SET_SOM, /* 3 Set start of match (\K) */ + OP_NOT_WORD_BOUNDARY, /* 4 \B */ + OP_WORD_BOUNDARY, /* 5 \b */ + OP_NOT_DIGIT, /* 6 \D */ + OP_DIGIT, /* 7 \d */ + OP_NOT_WHITESPACE, /* 8 \S */ + OP_WHITESPACE, /* 9 \s */ + OP_NOT_WORDCHAR, /* 10 \W */ + OP_WORDCHAR, /* 11 \w */ + + OP_ANY, /* 12 Match any character except newline (\N) */ + OP_ALLANY, /* 13 Match any character */ + OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ + OP_NOTPROP, /* 15 \P (not Unicode property) */ + OP_PROP, /* 16 \p (Unicode property) */ + OP_ANYNL, /* 17 \R (any newline sequence) */ + OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ + OP_HSPACE, /* 19 \h (horizontal whitespace) */ + OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ + OP_VSPACE, /* 21 \v (vertical whitespace) */ + OP_EXTUNI, /* 22 \X (extended Unicode sequence) */ + OP_EODN, /* 23 End of data or \n at end of data (\Z) */ + OP_EOD, /* 24 End of data (\z) */ + + /* Line end assertions */ + + OP_DOLL, /* 25 End of line - not multiline */ + OP_DOLLM, /* 26 End of line - multiline */ + OP_CIRC, /* 27 Start of line - not multiline */ + OP_CIRCM, /* 28 Start of line - multiline */ + + /* Single characters; caseful must precede the caseless ones, and these + must remain in this order, and adjacent. 
*/ + + OP_CHAR, /* 29 Match one character, casefully */ + OP_CHARI, /* 30 Match one character, caselessly */ + OP_NOT, /* 31 Match one character, not the given one, casefully */ + OP_NOTI, /* 32 Match one character, not the given one, caselessly */ + + /* The following sets of 13 opcodes must always be kept in step because + the offset from the first one is used to generate the others. */ + + /* Repeated characters; caseful must precede the caseless ones */ + + OP_STAR, /* 33 The maximizing and minimizing versions of */ + OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */ + OP_PLUS, /* 35 the minimizing one second. */ + OP_MINPLUS, /* 36 */ + OP_QUERY, /* 37 */ + OP_MINQUERY, /* 38 */ + + OP_UPTO, /* 39 From 0 to n matches of one character, caseful */ + OP_MINUPTO, /* 40 */ + OP_EXACT, /* 41 Exactly n matches */ + + OP_POSSTAR, /* 42 Possessified star, caseful */ + OP_POSPLUS, /* 43 Possessified plus, caseful */ + OP_POSQUERY, /* 44 Possessified query, caseful */ + OP_POSUPTO, /* 45 Possessified upto, caseful */ + + /* Repeated characters; caseless must follow the caseful ones */ + + OP_STARI, /* 46 */ + OP_MINSTARI, /* 47 */ + OP_PLUSI, /* 48 */ + OP_MINPLUSI, /* 49 */ + OP_QUERYI, /* 50 */ + OP_MINQUERYI, /* 51 */ + + OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */ + OP_MINUPTOI, /* 53 */ + OP_EXACTI, /* 54 */ + + OP_POSSTARI, /* 55 Possessified star, caseless */ + OP_POSPLUSI, /* 56 Possessified plus, caseless */ + OP_POSQUERYI, /* 57 Possessified query, caseless */ + OP_POSUPTOI, /* 58 Possessified upto, caseless */ + + /* The negated ones must follow the non-negated ones, and match them */ + /* Negated repeated character, caseful; must precede the caseless ones */ + + OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */ + OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */ + OP_NOTPLUS, /* 61 the minimizing one second. They must be in */ + OP_NOTMINPLUS, /* 62 exactly the same order as those above. 
*/ + OP_NOTQUERY, /* 63 */ + OP_NOTMINQUERY, /* 64 */ + + OP_NOTUPTO, /* 65 From 0 to n matches, caseful */ + OP_NOTMINUPTO, /* 66 */ + OP_NOTEXACT, /* 67 Exactly n matches */ + + OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */ + OP_NOTPOSPLUS, /* 69 */ + OP_NOTPOSQUERY, /* 70 */ + OP_NOTPOSUPTO, /* 71 */ + + /* Negated repeated character, caseless; must follow the caseful ones */ + + OP_NOTSTARI, /* 72 */ + OP_NOTMINSTARI, /* 73 */ + OP_NOTPLUSI, /* 74 */ + OP_NOTMINPLUSI, /* 75 */ + OP_NOTQUERYI, /* 76 */ + OP_NOTMINQUERYI, /* 77 */ + + OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */ + OP_NOTMINUPTOI, /* 79 */ + OP_NOTEXACTI, /* 80 Exactly n matches */ + + OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */ + OP_NOTPOSPLUSI, /* 82 */ + OP_NOTPOSQUERYI, /* 83 */ + OP_NOTPOSUPTOI, /* 84 */ + + /* Character types */ + + OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */ + OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */ + OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */ + OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */ + OP_TYPEQUERY, /* 89 */ + OP_TYPEMINQUERY, /* 90 */ + + OP_TYPEUPTO, /* 91 From 0 to n matches */ + OP_TYPEMINUPTO, /* 92 */ + OP_TYPEEXACT, /* 93 Exactly n matches */ + + OP_TYPEPOSSTAR, /* 94 Possessified versions */ + OP_TYPEPOSPLUS, /* 95 */ + OP_TYPEPOSQUERY, /* 96 */ + OP_TYPEPOSUPTO, /* 97 */ + + /* These are used for character classes and back references; only the + first six are the same as the sets above. */ + + OP_CRSTAR, /* 98 The maximizing and minimizing versions of */ + OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */ + OP_CRPLUS, /* 100 the minimizing one second. These codes must */ + OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */ + OP_CRQUERY, /* 102 */ + OP_CRMINQUERY, /* 103 */ + + OP_CRRANGE, /* 104 These are different to the three sets above. 
*/ + OP_CRMINRANGE, /* 105 */ + + OP_CRPOSSTAR, /* 106 Possessified versions */ + OP_CRPOSPLUS, /* 107 */ + OP_CRPOSQUERY, /* 108 */ + OP_CRPOSRANGE, /* 109 */ + + /* End of quantifier opcodes */ + + OP_CLASS, /* 110 Match a character class, chars < 256 only */ + OP_NCLASS, /* 111 Same, but the bitmap was created from a negative + class - the difference is relevant only when a + character > 255 is encountered. */ + OP_XCLASS, /* 112 Extended class for handling > 255 chars within the + class. This does both positive and negative. */ + OP_REF, /* 113 Match a back reference, casefully */ + OP_REFI, /* 114 Match a back reference, caselessly */ + OP_DNREF, /* 115 Match a duplicate name backref, casefully */ + OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ + OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 118 Call out to external function if provided */ + OP_CALLOUT_STR, /* 119 Call out with string argument */ + + OP_ALT, /* 120 Start of alternation */ + OP_KET, /* 121 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 122 These two must remain together and in this */ + OP_KETRMIN, /* 123 order. They are for groups that repeat for ever. */ + OP_KETRPOS, /* 124 Possessive unlimited repeat. */ + + /* The assertions must come before BRA, CBRA, ONCE, and COND. */ + + OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */ + OP_ASSERT, /* 126 Positive lookahead */ + OP_ASSERT_NOT, /* 127 Negative lookahead */ + OP_ASSERTBACK, /* 128 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */ + OP_ASSERT_NA, /* 130 Positive non-atomic lookahead */ + OP_ASSERTBACK_NA, /* 131 Positive non-atomic lookbehind */ + + /* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come + immediately after the assertions, with ONCE first, as there's a test for >= + ONCE for a subpattern that isn't an assertion. 
The POS versions must + immediately follow the non-POS versions in each case. */ + + OP_ONCE, /* 132 Atomic group, contains captures */ + OP_SCRIPT_RUN, /* 133 Non-capture, but check characters' scripts */ + OP_BRA, /* 134 Start of non-capturing bracket */ + OP_BRAPOS, /* 135 Ditto, with unlimited, possessive repeat */ + OP_CBRA, /* 136 Start of capturing bracket */ + OP_CBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ + OP_COND, /* 138 Conditional group */ + + /* These five must follow the previous five, in the same order. There's a + check for >= SBRA to distinguish the two sets. */ + + OP_SBRA, /* 139 Start of non-capturing bracket, check empty */ + OP_SBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */ + OP_SCBRA, /* 141 Start of capturing bracket, check empty */ + OP_SCBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */ + OP_SCOND, /* 143 Conditional group, check empty */ + + /* The next two pairs must (respectively) be kept together. */ + + OP_CREF, /* 144 Used to hold a capture number as condition */ + OP_DNCREF, /* 145 Used to point to duplicate names as a condition */ + OP_RREF, /* 146 Used to hold a recursion number as condition */ + OP_DNRREF, /* 147 Used to point to duplicate names as a condition */ + OP_FALSE, /* 148 Always false (used by DEFINE and VERSION) */ + OP_TRUE, /* 149 Always true (used by VERSION) */ + + OP_BRAZERO, /* 150 These two must remain together and in this */ + OP_BRAMINZERO, /* 151 order. */ + OP_BRAPOSZERO, /* 152 */ + + /* These are backtracking control verbs */ + + OP_MARK, /* 153 always has an argument */ + OP_PRUNE, /* 154 */ + OP_PRUNE_ARG, /* 155 same, but with argument */ + OP_SKIP, /* 156 */ + OP_SKIP_ARG, /* 157 same, but with argument */ + OP_THEN, /* 158 */ + OP_THEN_ARG, /* 159 same, but with argument */ + OP_COMMIT, /* 160 */ + OP_COMMIT_ARG, /* 161 same, but with argument */ + + /* These are forced failure and success verbs. 
FAIL and ACCEPT do accept an + argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL) + without the need for a special opcode. */ + + OP_FAIL, /* 162 */ + OP_ACCEPT, /* 163 */ + OP_ASSERT_ACCEPT, /* 164 Used inside assertions */ + OP_CLOSE, /* 165 Used before OP_ACCEPT to close open captures */ + + /* This is used to skip a subpattern with a {0} quantifier */ + + OP_SKIPZERO, /* 166 */ + + /* This is used to identify a DEFINE group during compilation so that it can + be checked for having only one branch. It is changed to OP_FALSE before + compilation finishes. */ + + OP_DEFINE, /* 167 */ + + /* This is not an opcode, but is used to check that tables indexed by opcode + are the correct length, in order to catch updating errors - there have been + some in the past. */ + + OP_TABLE_LENGTH + +}; + +/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro +definitions that follow must also be updated to match. There are also tables +called "opcode_possessify" in pcre2_compile.c and "coptable" and "poptable" in +pcre2_dfa_match.c that must be updated. */ + + +/* This macro defines textual names for all the opcodes. These are used only +for debugging, and some of them are only partial names. The macro is referenced +only in pcre2_printint.c, which fills out the full names in many cases (and in +some cases doesn't actually use these names at all). 
*/ + +#define OP_NAME_LIST \ + "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ + "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ + "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ + "extuni", "\\Z", "\\z", \ + "$", "$", "^", "^", "char", "chari", "not", "noti", \ + "*", "*?", "+", "+?", "?", "??", \ + "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", \ + "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", \ + "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", \ + "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ + "*+","++", "?+", "{", \ + "*", "*?", "+", "+?", "?", "??", "{", "{", \ + "*+","++", "?+", "{", \ + "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ + "Recurse", "Callout", "CalloutStr", \ + "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ + "Reverse", "Assert", "Assert not", \ + "Assert back", "Assert back not", \ + "Non-atomic assert", "Non-atomic assert back", \ + "Once", \ + "Script run", \ + "Bra", "BraPos", "CBra", "CBraPos", \ + "Cond", \ + "SBra", "SBraPos", "SCBra", "SCBraPos", \ + "SCond", \ + "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", \ + "Cond false", "Cond true", \ + "Brazero", "Braminzero", "Braposzero", \ + "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ + "*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \ + "*ACCEPT", "*ASSERT_ACCEPT", \ + "Close", "Skip zero", "Define" + + +/* This macro defines the length of fixed length operations in the compiled +regex. The lengths are used when searching for specific things, and also in the +debugging printing of a compiled regex. We use a macro so that it can be +defined close to the definitions of the opcodes themselves. + +As things have been extended, some of these are no longer fixed lengths, but are +minima instead. For example, the length of a single-character repeat may vary +in UTF-8 mode. 
The code that uses this table must know about such things. */ + +#define OP_LENGTHS \ + 1, /* End */ \ + 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ + 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ + 1, 1, 1, /* Any, AllAny, Anybyte */ \ + 3, 3, /* \P, \p */ \ + 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ + 1, /* \X */ \ + 1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \ + 2, /* Char - the minimum length */ \ + 2, /* Chari - the minimum length */ \ + 2, /* not */ \ + 2, /* noti */ \ + /* Positive single-char repeats ** These are */ \ + 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto ** mode */ \ + 2+IMM2_SIZE, /* exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \ + 2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \ + 2+IMM2_SIZE, /* exact I */ \ + 2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \ + /* Negative single-char repeats - only for chars < 256 */ \ + 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \ + 2+IMM2_SIZE, /* NOT exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \ + 2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \ + 2+IMM2_SIZE, /* NOT exact I */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \ + /* Positive type repeats */ \ + 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \ + 2+IMM2_SIZE, /* Type exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \ + /* Character class & ref repeats */ \ + 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ \ + 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ + 1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \ + 1+(32/sizeof(PCRE2_UCHAR)), /* CLASS */ \ + 1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \ + 0, /* XCLASS - variable length */ \ + 1+IMM2_SIZE, /* REF */ \ + 1+IMM2_SIZE, /* REFI */ \ + 1+2*IMM2_SIZE, /* DNREF */ \ + 1+2*IMM2_SIZE, /* DNREFI */ \ + 1+LINK_SIZE, /* RECURSE */ \ + 1+2*LINK_SIZE+1, /* CALLOUT */ \ + 0, /* CALLOUT_STR - variable length */ \ + 1+LINK_SIZE, /* Alt */ \ + 1+LINK_SIZE, /* Ket */ \ + 1+LINK_SIZE, /* KetRmax */ \ + 1+LINK_SIZE, /* KetRmin */ \ + 1+LINK_SIZE, /* KetRpos */ \ + 1+LINK_SIZE, /* Reverse */ \ + 1+LINK_SIZE, /* Assert */ \ + 1+LINK_SIZE, /* Assert not */ \ + 1+LINK_SIZE, /* Assert behind */ \ + 1+LINK_SIZE, /* Assert behind not */ \ + 1+LINK_SIZE, /* NA Assert */ \ + 1+LINK_SIZE, /* NA Assert behind */ \ + 1+LINK_SIZE, /* ONCE */ \ + 1+LINK_SIZE, /* SCRIPT_RUN */ \ + 1+LINK_SIZE, /* BRA */ \ + 1+LINK_SIZE, /* BRAPOS */ \ + 1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \ + 1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \ + 1+LINK_SIZE, /* COND */ \ + 1+LINK_SIZE, /* SBRA */ \ + 1+LINK_SIZE, /* SBRAPOS */ \ + 1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ + 1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ + 1+LINK_SIZE, /* SCOND */ \ + 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \ + 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \ + 1, 1, /* FALSE, TRUE */ \ + 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ + 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ + 1, 3, /* SKIP, SKIP_ARG */ \ + 1, 3, /* THEN, THEN_ARG */ \ + 1, 3, /* COMMIT, COMMIT_ARG */ \ + 1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \ + 1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \ + 1 /* DEFINE */ + +/* A magic value for OP_RREF to indicate the "any recursion" condition. */ + +#define RREF_ANY 0xffff + + +/* ---------- Private structures that are mode-independent. ---------- */ + +/* Structure to hold data for custom memory management. 
*/ + +typedef struct pcre2_memctl { + void * (*malloc)(size_t, void *); + void (*free)(void *, void *); + void *memory_data; +} pcre2_memctl; + +/* Structure for building a chain of open capturing subpatterns during +compiling, so that instructions to close them can be compiled when (*ACCEPT) is +encountered. */ + +typedef struct open_capitem { + struct open_capitem *next; /* Chain link */ + uint16_t number; /* Capture number */ + uint16_t assert_depth; /* Assertion depth when opened */ +} open_capitem; + +/* Layout of the UCP type table that translates property names into types and +codes. Each entry used to point directly to a name, but to reduce the number of +relocations in shared libraries, it now has an offset into a single string +instead. */ + +typedef struct { + uint16_t name_offset; + uint16_t type; + uint16_t value; +} ucp_type_table; + +/* Unicode character database (UCD) record format */ + +typedef struct { + uint8_t script; /* ucp_Arabic, etc. */ + uint8_t chartype; /* ucp_Cc, etc. (general categories) */ + uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */ + uint8_t caseset; /* offset to multichar other cases or zero */ + int32_t other_case; /* offset to other case, or zero if none */ + uint16_t scriptx_bidiclass; /* script extension (11 bit) and bidi class (5 bit) values */ + uint16_t bprops; /* binary properties offset */ +} ucd_record; + +/* UCD access macros */ + +#define UCD_BLOCK_SIZE 128 +#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \ + PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \ + UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE]) + +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define GET_UCD(ch) ((ch > MAX_UTF_CODE_POINT)? 
\ + PRIV(dummy_ucd_record) : REAL_GET_UCD(ch)) +#else +#define GET_UCD(ch) REAL_GET_UCD(ch) +#endif + +#define UCD_SCRIPTX_MASK 0x3ff +#define UCD_BIDICLASS_SHIFT 11 +#define UCD_BPROPS_MASK 0xfff + +#define UCD_SCRIPTX_PROP(prop) ((prop)->scriptx_bidiclass & UCD_SCRIPTX_MASK) +#define UCD_BIDICLASS_PROP(prop) ((prop)->scriptx_bidiclass >> UCD_BIDICLASS_SHIFT) +#define UCD_BPROPS_PROP(prop) ((prop)->bprops & UCD_BPROPS_MASK) + +#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype +#define UCD_SCRIPT(ch) GET_UCD(ch)->script +#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] +#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop +#define UCD_CASESET(ch) GET_UCD(ch)->caseset +#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case))) +#define UCD_SCRIPTX(ch) UCD_SCRIPTX_PROP(GET_UCD(ch)) +#define UCD_BPROPS(ch) UCD_BPROPS_PROP(GET_UCD(ch)) +#define UCD_BIDICLASS(ch) UCD_BIDICLASS_PROP(GET_UCD(ch)) + +/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words +that form a bitmap representing a list of scripts or boolean properties. These +macros test or set a bit in the map by number. */ + +#define MAPBIT(map,n) ((map)[(n)/32]&(1u<<((n)%32))) +#define MAPSET(map,n) ((map)[(n)/32]|=(1u<<((n)%32))) + +/* Header for serialized pcre2 codes. */ + +typedef struct pcre2_serialized_data { + uint32_t magic; + uint32_t version; + uint32_t config; + int32_t number_of_codes; +} pcre2_serialized_data; + + + +/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */ + +/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is defined as +0, so the following items are omitted. */ + +#if defined PCRE2_CODE_UNIT_WIDTH && PCRE2_CODE_UNIT_WIDTH != 0 + +/* EBCDIC is supported only for the 8-bit library. */ + +#if defined EBCDIC && PCRE2_CODE_UNIT_WIDTH != 8 +#error EBCDIC is not supported for the 16-bit or 32-bit libraries +#endif + +/* This is the largest non-UTF code point. 
*/ + +#define MAX_NON_UTF_CHAR (0xffffffffU >> (32 - PCRE2_CODE_UNIT_WIDTH)) + +/* Internal shared data tables and variables. These are used by more than one +of the exported public functions. They have to be "external" in the C sense, +but are not part of the PCRE2 public API. Although the data for some of them is +identical in all libraries, they must have different names so that multiple +libraries can be simultaneously linked to a single application. However, UTF-8 +tables are needed only when compiling the 8-bit library. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +extern const int PRIV(utf8_table1)[]; +extern const int PRIV(utf8_table1_size); +extern const int PRIV(utf8_table2)[]; +extern const int PRIV(utf8_table3)[]; +extern const uint8_t PRIV(utf8_table4)[]; +#endif + +#define _pcre2_OP_lengths PCRE2_SUFFIX(_pcre2_OP_lengths_) +#define _pcre2_callout_end_delims PCRE2_SUFFIX(_pcre2_callout_end_delims_) +#define _pcre2_callout_start_delims PCRE2_SUFFIX(_pcre2_callout_start_delims_) +#define _pcre2_default_compile_context PCRE2_SUFFIX(_pcre2_default_compile_context_) +#define _pcre2_default_convert_context PCRE2_SUFFIX(_pcre2_default_convert_context_) +#define _pcre2_default_match_context PCRE2_SUFFIX(_pcre2_default_match_context_) +#define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_) +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define _pcre2_dummy_ucd_record PCRE2_SUFFIX(_pcre2_dummy_ucd_record_) +#endif +#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_) +#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_) +#define _pcre2_ucd_boolprop_sets PCRE2_SUFFIX(_pcre2_ucd_boolprop_sets_) +#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_) +#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_) +#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_) +#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_) +#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_) +#define _pcre2_ucd_stage2 
PCRE2_SUFFIX(_pcre2_ucd_stage2_) +#define _pcre2_ucp_gbtable PCRE2_SUFFIX(_pcre2_ucp_gbtable_) +#define _pcre2_ucp_gentype PCRE2_SUFFIX(_pcre2_ucp_gentype_) +#define _pcre2_ucp_typerange PCRE2_SUFFIX(_pcre2_ucp_typerange_) +#define _pcre2_unicode_version PCRE2_SUFFIX(_pcre2_unicode_version_) +#define _pcre2_utt PCRE2_SUFFIX(_pcre2_utt_) +#define _pcre2_utt_names PCRE2_SUFFIX(_pcre2_utt_names_) +#define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_) + +extern const uint8_t PRIV(OP_lengths)[]; +extern const uint32_t PRIV(callout_end_delims)[]; +extern const uint32_t PRIV(callout_start_delims)[]; +extern const pcre2_compile_context PRIV(default_compile_context); +extern const pcre2_convert_context PRIV(default_convert_context); +extern const pcre2_match_context PRIV(default_match_context); +extern const uint8_t PRIV(default_tables)[]; +extern const uint32_t PRIV(hspace_list)[]; +extern const uint32_t PRIV(vspace_list)[]; +extern const uint32_t PRIV(ucd_boolprop_sets)[]; +extern const uint32_t PRIV(ucd_caseless_sets)[]; +extern const uint32_t PRIV(ucd_digit_sets)[]; +extern const uint32_t PRIV(ucd_script_sets)[]; +extern const ucd_record PRIV(ucd_records)[]; +#if PCRE2_CODE_UNIT_WIDTH == 32 +extern const ucd_record PRIV(dummy_ucd_record)[]; +#endif +extern const uint16_t PRIV(ucd_stage1)[]; +extern const uint16_t PRIV(ucd_stage2)[]; +extern const uint32_t PRIV(ucp_gbtable)[]; +extern const uint32_t PRIV(ucp_gentype)[]; +#ifdef SUPPORT_JIT +extern const int PRIV(ucp_typerange)[]; +#endif +extern const char *PRIV(unicode_version); +extern const ucp_type_table PRIV(utt)[]; +extern const char PRIV(utt_names)[]; +extern const size_t PRIV(utt_size); + +/* Mode-dependent macros and hidden and private structures are defined in a +separate file so that pcre2test can include them at all supported widths. 
When +compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we can +include them at the appropriate width, after setting up suffix macros for the +private structures. */ + +#define branch_chain PCRE2_SUFFIX(branch_chain_) +#define compile_block PCRE2_SUFFIX(compile_block_) +#define dfa_match_block PCRE2_SUFFIX(dfa_match_block_) +#define match_block PCRE2_SUFFIX(match_block_) +#define named_group PCRE2_SUFFIX(named_group_) + +#include "regexp/pcre2/pcre2_intmodedep.h" + +/* Private "external" functions. These are internal functions that are called +from modules other than the one in which they are defined. They have to be +"external" in the C sense, but are not part of the PCRE2 public API. They are +not referenced from pcre2test, and must not be defined when no code unit width +is available. */ + +#define _pcre2_auto_possessify PCRE2_SUFFIX(_pcre2_auto_possessify_) +#define _pcre2_check_escape PCRE2_SUFFIX(_pcre2_check_escape_) +#define _pcre2_extuni PCRE2_SUFFIX(_pcre2_extuni_) +#define _pcre2_find_bracket PCRE2_SUFFIX(_pcre2_find_bracket_) +#define _pcre2_is_newline PCRE2_SUFFIX(_pcre2_is_newline_) +#define _pcre2_jit_free_rodata PCRE2_SUFFIX(_pcre2_jit_free_rodata_) +#define _pcre2_jit_free PCRE2_SUFFIX(_pcre2_jit_free_) +#define _pcre2_jit_get_size PCRE2_SUFFIX(_pcre2_jit_get_size_) +#define _pcre2_jit_get_target PCRE2_SUFFIX(_pcre2_jit_get_target_) +#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_) +#define _pcre2_ord2utf PCRE2_SUFFIX(_pcre2_ord2utf_) +#define _pcre2_script_run PCRE2_SUFFIX(_pcre2_script_run_) +#define _pcre2_strcmp PCRE2_SUFFIX(_pcre2_strcmp_) +#define _pcre2_strcmp_c8 PCRE2_SUFFIX(_pcre2_strcmp_c8_) +#define _pcre2_strcpy_c8 PCRE2_SUFFIX(_pcre2_strcpy_c8_) +#define _pcre2_strlen PCRE2_SUFFIX(_pcre2_strlen_) +#define _pcre2_strncmp PCRE2_SUFFIX(_pcre2_strncmp_) +#define _pcre2_strncmp_c8 PCRE2_SUFFIX(_pcre2_strncmp_c8_) +#define _pcre2_study PCRE2_SUFFIX(_pcre2_study_) +#define _pcre2_valid_utf 
PCRE2_SUFFIX(_pcre2_valid_utf_) +#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) +#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) + +extern int _pcre2_auto_possessify(PCRE2_UCHAR *, + const compile_block *); +extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, + int *, uint32_t, uint32_t, BOOL, compile_block *); +extern PCRE2_SPTR _pcre2_extuni(uint32_t, PCRE2_SPTR, PCRE2_SPTR, PCRE2_SPTR, + BOOL, int *); +extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); +extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, + uint32_t *, BOOL); +extern void _pcre2_jit_free_rodata(void *, void *); +extern void _pcre2_jit_free(void *, pcre2_memctl *); +extern size_t _pcre2_jit_get_size(void *); +const char * _pcre2_jit_get_target(void); +extern void * _pcre2_memctl_malloc(size_t, pcre2_memctl *); +extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *); +extern BOOL _pcre2_script_run(PCRE2_SPTR, PCRE2_SPTR, BOOL); +extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR); +extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *); +extern PCRE2_SIZE _pcre2_strcpy_c8(PCRE2_UCHAR *, const char *); +extern PCRE2_SIZE _pcre2_strlen(PCRE2_SPTR); +extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); +extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); +extern int _pcre2_study(pcre2_real_code *); +extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); +extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, + uint32_t *, BOOL); +extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); + +/* This function is needed only when memmove() is not available. 
*/ + +#if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) +#define _pcre2_memmove PCRE2_SUFFIX(_pcre2_memmove) +extern void * _pcre2_memmove(void *, const void *, size_t); +#endif + +#endif /* PCRE2_CODE_UNIT_WIDTH */ +#endif /* PCRE2_INTERNAL_H_IDEMPOTENT_GUARD */ + +/* End of pcre2_internal.h */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_intmodedep.h b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_intmodedep.h new file mode 100644 index 0000000000..007e4c0139 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_intmodedep.h @@ -0,0 +1,934 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains mode-dependent macro and structure definitions. The +file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined. +These mode-dependent items are kept in a separate file so that they can also be +#included multiple times for different code unit widths by pcre2test in order +to have access to the hidden structures at all supported widths. + +Some of the mode-dependent macros are required at different widths for +different parts of the pcre2test code (in particular, the included +pcre_printint.c file). We undefine them here so that they can be re-defined for +multiple inclusions. Not all of these are used in pcre2test, but it's easier +just to undefine them all. 
*/ + +#undef ACROSSCHAR +#undef BACKCHAR +#undef BYTES2CU +#undef CHMAX_255 +#undef CU2BYTES +#undef FORWARDCHAR +#undef FORWARDCHARTEST +#undef GET +#undef GET2 +#undef GETCHAR +#undef GETCHARINC +#undef GETCHARINCTEST +#undef GETCHARLEN +#undef GETCHARLENTEST +#undef GETCHARTEST +#undef GET_EXTRALEN +#undef HAS_EXTRALEN +#undef IMM2_SIZE +#undef MAX_255 +#undef MAX_MARK +#undef MAX_PATTERN_SIZE +#undef MAX_UTF_SINGLE_CU +#undef NOT_FIRSTCU +#undef PUT +#undef PUT2 +#undef PUT2INC +#undef PUTCHAR +#undef PUTINC +#undef TABLE_GET + + + +/* -------------------------- MACROS ----------------------------- */ + +/* PCRE keeps offsets in its compiled code as at least 16-bit quantities +(always stored in big-endian order in 8-bit mode) by default. These are used, +for example, to link from the start of a subpattern to its alternatives and its +end. The use of 16 bits per offset limits the size of an 8-bit compiled regex +to around 64K, which is big enough for almost everybody. However, I received a +request for an even bigger limit. For this reason, and also to make the code +easier to maintain, the storing and loading of offsets from the compiled code +unit string is now handled by the macros that are defined here. + +The macros are controlled by the value of LINK_SIZE. This defaults to 2, but +values of 3 or 4 are also supported. 
*/ + +/* ------------------- 8-bit support ------------------ */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + +#if LINK_SIZE == 2 +#define PUT(a,n,d) \ + (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) +#define GET(a,n) \ + (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) +#define MAX_PATTERN_SIZE (1 << 16) + +#elif LINK_SIZE == 3 +#define PUT(a,n,d) \ + (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) +#define GET(a,n) \ + (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) +#define MAX_PATTERN_SIZE (1 << 24) + +#elif LINK_SIZE == 4 +#define PUT(a,n,d) \ + (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+3] = (PCRE2_UCHAR)((d) & 255)) +#define GET(a,n) \ + (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) +#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ + +#else +#error LINK_SIZE must be 2, 3, or 4 +#endif + + +/* ------------------- 16-bit support ------------------ */ + +#elif PCRE2_CODE_UNIT_WIDTH == 16 + +#if LINK_SIZE == 2 +#undef LINK_SIZE +#define LINK_SIZE 1 +#define PUT(a,n,d) \ + (a[n] = (PCRE2_UCHAR)(d)) +#define GET(a,n) \ + (a[n]) +#define MAX_PATTERN_SIZE (1 << 16) + +#elif LINK_SIZE == 3 || LINK_SIZE == 4 +#undef LINK_SIZE +#define LINK_SIZE 2 +#define PUT(a,n,d) \ + (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) +#define GET(a,n) \ + (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) +#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ + +#else +#error LINK_SIZE must be 2, 3, or 4 +#endif + + +/* ------------------- 32-bit support ------------------ */ + +#elif PCRE2_CODE_UNIT_WIDTH == 32 +#undef LINK_SIZE +#define LINK_SIZE 1 +#define PUT(a,n,d) \ + (a[n] = (d)) +#define GET(a,n) \ + (a[n]) +#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ + +#else +#error Unsupported compiling mode +#endif 
+ + +/* --------------- Other mode-specific macros ----------------- */ + +/* PCRE uses some other (at least) 16-bit quantities that do not change when +the size of offsets changes. These are used for repeat counts and for other +things such as capturing parenthesis numbers in back references. + +Define the number of code units required to hold a 16-bit count/offset, and +macros to load and store such a value. For reasons that I do not understand, +the expression in the 8-bit GET2 macro is treated by gcc as a signed +expression, even when a is declared as unsigned. It seems that any kind of +arithmetic results in a signed value. Hence the cast. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define IMM2_SIZE 2 +#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) +#define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255 + +#else /* Code units are 16 or 32 bits */ +#define IMM2_SIZE 1 +#define GET2(a,n) a[n] +#define PUT2(a,n,d) a[n] = d +#endif + +/* Other macros that are different for 8-bit mode. The MAX_255 macro checks +whether its argument, which is assumed to be one code unit, is less than 256. +The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK +name must fit in one code unit; currently it is set to 255 or 65535. The +TABLE_GET macro is used to access elements of tables containing exactly 256 +items. Its argument is a code unit. When code points can be greater than 255, a +check is needed before accessing these tables.
*/ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define MAX_255(c) TRUE +#define MAX_MARK ((1u << 8) - 1) +#define TABLE_GET(c, table, default) ((table)[c]) +#ifdef SUPPORT_UNICODE +#define SUPPORT_WIDE_CHARS +#define CHMAX_255(c) ((c) <= 255u) +#else +#define CHMAX_255(c) TRUE +#endif /* SUPPORT_UNICODE */ + +#else /* Code units are 16 or 32 bits */ +#define CHMAX_255(c) ((c) <= 255u) +#define MAX_255(c) ((c) <= 255u) +#define MAX_MARK ((1u << 16) - 1) +#define SUPPORT_WIDE_CHARS +#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) +#endif + + +/* ----------------- Character-handling macros ----------------- */ + +/* There is a proposed future special "UTF-21" mode, in which only the lowest +21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 +high-order bits available to the application for other uses. In preparation for +the future implementation of this mode, there are macros that load a data item +and, if in this special mode, mask it to 21 bits. These macros all have names +starting with UCHAR21. In all other modes, including the normal 32-bit +library, the macros all have the same simple definitions. When the new mode is +implemented, it is expected that these definitions will be varied appropriately +using #ifdef when compiling the library that supports the special mode. */ + +#define UCHAR21(eptr) (*(eptr)) +#define UCHAR21TEST(eptr) (*(eptr)) +#define UCHAR21INC(eptr) (*(eptr)++) +#define UCHAR21INCTEST(eptr) (*(eptr)++) + +/* When UTF encoding is being used, a character is no longer just a single +byte in 8-bit mode or a single short in 16-bit mode. The macros for character +handling generate simple sequences when used in the basic mode, and more +complicated ones for UTF characters. GETCHARLENTEST and other macros are not +used when UTF is not supported. To make sure they can never even appear when +UTF support is omitted, we don't even define them. 
*/ + +#ifndef SUPPORT_UNICODE + +/* #define MAX_UTF_SINGLE_CU */ +/* #define HAS_EXTRALEN(c) */ +/* #define GET_EXTRALEN(c) */ +/* #define NOT_FIRSTCU(c) */ +#define GETCHAR(c, eptr) c = *eptr; +#define GETCHARTEST(c, eptr) c = *eptr; +#define GETCHARINC(c, eptr) c = *eptr++; +#define GETCHARINCTEST(c, eptr) c = *eptr++; +#define GETCHARLEN(c, eptr, len) c = *eptr; +#define PUTCHAR(c, p) (*p = c, 1) +/* #define GETCHARLENTEST(c, eptr, len) */ +/* #define BACKCHAR(eptr) */ +/* #define FORWARDCHAR(eptr) */ +/* #define FORWARDCHARTEST(eptr,end) */ +/* #define ACROSSCHAR(condition, eptr, action) */ + +#else /* SUPPORT_UNICODE */ + +/* ------------------- 8-bit support ------------------ */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ + +/* The largest UTF code point that can be encoded as a single code unit. */ + +#define MAX_UTF_SINGLE_CU 127 + +/* Tests whether the code point needs extra characters to decode. */ + +#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c) + +/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu]) + +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ + +#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u) + +/* Get the next UTF-8 character, not advancing the pointer. This is called when +we know we are in UTF-8 mode. */ + +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if (c >= 0xc0u) GETUTF8(c, eptr); + +/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf && c >= 0xc0u) GETUTF8(c, eptr); + +/* Get the next UTF-8 character, advancing the pointer. This is called when we +know we are in UTF-8 mode.
*/ + +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if (c >= 0xc0u) GETUTF8INC(c, eptr); + +/* Get the next character, testing for UTF-8 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-8 mode. */ + +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf && c >= 0xc0u) GETUTF8INC(c, eptr); + +/* Get the next UTF-8 character, not advancing the pointer, incrementing length +if there are extra bytes. This is called when we know we are in UTF-8 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if (c >= 0xc0u) GETUTF8LEN(c, eptr, len); + +/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the +pointer, incrementing length if there are extra bytes. This is called when we +do not know if we are in UTF-8 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + c = *eptr; \ + if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len); + +/* If the pointer is not at the start of a character, move it back until +it is. This is called only in UTF-8 mode - we don't put a test within the macro +because almost all calls are already within a block of UTF-8 only code. */ + +#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr-- + +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++ +#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++ + +/* Same as above, but it allows a fully customizable form. */ +#define ACROSSCHAR(condition, eptr, action) \ + while((condition) && ((*eptr) & 0xc0u) == 0x80u) action + +/* Deposit a character into memory, returning the number of code units. */ + +#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ + PRIV(ord2utf)(c,p) : (*p = c, 1)) + + +/* ------------------- 16-bit support ------------------ */ + +#elif PCRE2_CODE_UNIT_WIDTH == 16 +#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ + +/* The largest UTF code point that can be encoded as a single code unit. 
*/ + +#define MAX_UTF_SINGLE_CU 65535 + +/* Tests whether the code point needs extra characters to decode. */ + +#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u) + +/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) 1 + +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ + +#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u) + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer. */ + +#define GETUTF16(c, eptr) \ + { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; } + +/* Get the next UTF-16 character, not advancing the pointer. This is called when +we know we are in UTF-16 mode. */ + +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); + +/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, advancing +the pointer. */ + +#define GETUTF16INC(c, eptr) \ + { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; } + +/* Get the next UTF-16 character, advancing the pointer. This is called when we +know we are in UTF-16 mode. */ + +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); + +/* Get the next character, testing for UTF-16 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-16 mode. */ + +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer, incrementing the length. 
*/ + +#define GETUTF16LEN(c, eptr, len) \ + { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; } + +/* Get the next UTF-16 character, not advancing the pointer, incrementing +length if there is a low surrogate. This is called when we know we are in +UTF-16 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); + +/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the +pointer, incrementing length if there is a low surrogate. This is called when +we do not know if we are in UTF-16 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + c = *eptr; \ + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); + +/* If the pointer is not at the start of a character, move it back until +it is. This is called only in UTF-16 mode - we don't put a test within the +macro because almost all calls are already within a block of UTF-16 only +code. */ + +#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr-- + +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++ +#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++ + +/* Same as above, but it allows a fully customizable form. */ +#define ACROSSCHAR(condition, eptr, action) \ + if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action + +/* Deposit a character into memory, returning the number of code units. */ + +#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ + PRIV(ord2utf)(c,p) : (*p = c, 1)) + + +/* ------------------- 32-bit support ------------------ */ + +#else + +/* These are trivial for the 32-bit library, since all UTF-32 characters fit +into one PCRE2_UCHAR unit. */ + +#define MAX_UTF_SINGLE_CU (0x10ffffu) +#define HAS_EXTRALEN(c) (0) +#define GET_EXTRALEN(c) (0) +#define NOT_FIRSTCU(c) (0) + +/* Get the next UTF-32 character, not advancing the pointer.
This is called when +we know we are in UTF-32 mode. */ + +#define GETCHAR(c, eptr) \ + c = *(eptr); + +/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *(eptr); + +/* Get the next UTF-32 character, advancing the pointer. This is called when we +know we are in UTF-32 mode. */ + +#define GETCHARINC(c, eptr) \ + c = *((eptr)++); + +/* Get the next character, testing for UTF-32 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-32 mode. */ + +#define GETCHARINCTEST(c, eptr) \ + c = *((eptr)++); + +/* Get the next UTF-32 character, not advancing the pointer, not incrementing +length (since all UTF-32 is of length 1). This is called when we know we are in +UTF-32 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + GETCHAR(c, eptr) + +/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the +pointer, not incrementing the length (since all UTF-32 is of length 1). +This is called when we do not know if we are in UTF-32 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + GETCHARTEST(c, eptr) + +/* If the pointer is not at the start of a character, move it back until +it is. This is called only in UTF-32 mode - we don't put a test within the +macro because almost all calls are already within a block of UTF-32 only +code. + +These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */ + +#define BACKCHAR(eptr) do { } while (0) + +/* Same as above, just in the other direction. */ + +#define FORWARDCHAR(eptr) do { } while (0) +#define FORWARDCHARTEST(eptr,end) do { } while (0) + +/* Same as above, but it allows a fully customizable form. */ + +#define ACROSSCHAR(condition, eptr, action) do { } while (0) + +/* Deposit a character into memory, returning the number of code units.
*/ + +#define PUTCHAR(c, p) (*p = c, 1) + +#endif /* UTF-32 character handling */ +#endif /* SUPPORT_UNICODE */ + + +/* Mode-dependent macros that have the same definition in all modes. */ + +#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8))) +#define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8))) +#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE +#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE + + +/* ----------------------- HIDDEN STRUCTURES ----------------------------- */ + +/* NOTE: All these structures *must* start with a pcre2_memctl structure. The +code that uses them is simpler because it assumes this. */ + +/* The real general context structure. At present it holds only data for custom +memory control. */ + +typedef struct pcre2_real_general_context { + pcre2_memctl memctl; +} pcre2_real_general_context; + +/* The real compile context structure */ + +typedef struct pcre2_real_compile_context { + pcre2_memctl memctl; + int (*stack_guard)(uint32_t, void *); + void *stack_guard_data; + const uint8_t *tables; + PCRE2_SIZE max_pattern_length; + uint16_t bsr_convention; + uint16_t newline_convention; + uint32_t parens_nest_limit; + uint32_t extra_options; +} pcre2_real_compile_context; + +/* The real match context structure. */ + +typedef struct pcre2_real_match_context { + pcre2_memctl memctl; +#ifdef SUPPORT_JIT + pcre2_jit_callback jit_callback; + void *jit_callback_data; +#endif + int (*callout)(pcre2_callout_block *, void *); + void *callout_data; + int (*substitute_callout)(pcre2_substitute_callout_block *, void *); + void *substitute_callout_data; + PCRE2_SIZE offset_limit; + uint32_t heap_limit; + uint32_t match_limit; + uint32_t depth_limit; +} pcre2_real_match_context; + +/* The real convert context structure. */ + +typedef struct pcre2_real_convert_context { + pcre2_memctl memctl; + uint32_t glob_separator; + uint32_t glob_escape; +} pcre2_real_convert_context; + +/* The real compiled code structure. 
The type for the blocksize field is +defined specially because it is required in pcre2_serialize_decode() when +copying the size from possibly unaligned memory into a variable of the same +type. Use a macro rather than a typedef to avoid compiler warnings when this +file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the +largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit +argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field +here.) */ + +#undef CODE_BLOCKSIZE_TYPE +#define CODE_BLOCKSIZE_TYPE PCRE2_SIZE + +#undef LOOKBEHIND_MAX +#define LOOKBEHIND_MAX UINT16_MAX + +typedef struct pcre2_real_code { + pcre2_memctl memctl; /* Memory control fields */ + const uint8_t *tables; /* The character tables */ + void *executable_jit; /* Pointer to JIT code */ + uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ + CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ + uint32_t magic_number; /* Paranoid and endianness check */ + uint32_t compile_options; /* Options passed to pcre2_compile() */ + uint32_t overall_options; /* Options after processing the pattern */ + uint32_t extra_options; /* Taken from compile_context */ + uint32_t flags; /* Various state flags */ + uint32_t limit_heap; /* Limit set in the pattern */ + uint32_t limit_match; /* Limit set in the pattern */ + uint32_t limit_depth; /* Limit set in the pattern */ + uint32_t first_codeunit; /* Starting code unit */ + uint32_t last_codeunit; /* This codeunit must be seen */ + uint16_t bsr_convention; /* What \R matches */ + uint16_t newline_convention; /* What is a newline? 
*/ + uint16_t max_lookbehind; /* Longest lookbehind (characters) */ + uint16_t minlength; /* Minimum length of match */ + uint16_t top_bracket; /* Highest numbered group */ + uint16_t top_backref; /* Highest numbered back reference */ + uint16_t name_entry_size; /* Size (code units) of table entries */ + uint16_t name_count; /* Number of name entries in the table */ +} pcre2_real_code; + +/* The real match data structure. Define ovector as large as it can ever +actually be so that array bound checkers don't grumble. Memory for this +structure is obtained by calling pcre2_match_data_create(), which sets the size +as the offset of ovector plus a pair of elements for each capturable string, so +the size varies from call to call. As the maximum number of capturing +subpatterns is 65535 we must allow for 65536 strings to include the overall +match. (See also the heapframe structure below.) */ + +struct heapframe; /* Forward reference */ + +typedef struct pcre2_real_match_data { + pcre2_memctl memctl; /* Memory control fields */ + const pcre2_real_code *code; /* The pattern used for the match */ + PCRE2_SPTR subject; /* The subject that was matched */ + PCRE2_SPTR mark; /* Pointer to last mark */ + struct heapframe *heapframes; /* Backtracking frames heap memory */ + PCRE2_SIZE heapframes_size; /* Malloc-ed size */ + PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ + PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ + PCRE2_SIZE startchar; /* Offset to starting code unit */ + uint8_t matchedby; /* Type of match (normal, JIT, DFA) */ + uint8_t flags; /* Various flags */ + uint16_t oveccount; /* Number of pairs */ + int rc; /* The return code from the match */ + PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ +} pcre2_real_match_data; + + +/* ----------------------- PRIVATE STRUCTURES ----------------------------- */ + +/* These structures are not needed for pcre2test. 
*/ + +#ifndef PCRE2_PCRE2TEST + +/* Structures for checking for mutual recursion when scanning compiled or +parsed code. */ + +typedef struct recurse_check { + struct recurse_check *prev; + PCRE2_SPTR group; +} recurse_check; + +typedef struct parsed_recurse_check { + struct parsed_recurse_check *prev; + uint32_t *groupptr; +} parsed_recurse_check; + +/* Structure for building a cache when filling in recursion offsets. */ + +typedef struct recurse_cache { + PCRE2_SPTR group; + int groupnumber; +} recurse_cache; + +/* Structure for maintaining a chain of pointers to the currently incomplete +branches, for testing for left recursion while compiling. */ + +typedef struct branch_chain { + struct branch_chain *outer; + PCRE2_UCHAR *current_branch; +} branch_chain; + +/* Structure for building a list of named groups during the first pass of +compiling. */ + +typedef struct named_group { + PCRE2_SPTR name; /* Points to the name in the pattern */ + uint32_t number; /* Group number */ + uint16_t length; /* Length of the name */ + uint16_t isdup; /* TRUE if a duplicate */ +} named_group; + +/* Structure for passing "static" information around between the functions +doing the compiling, so that they are thread-safe. 
*/ + +typedef struct compile_block { + pcre2_real_compile_context *cx; /* Points to the compile context */ + const uint8_t *lcc; /* Points to lower casing table */ + const uint8_t *fcc; /* Points to case-flipping table */ + const uint8_t *cbits; /* Points to character type table */ + const uint8_t *ctypes; /* Points to table of type maps */ + PCRE2_SPTR start_workspace; /* The start of working space */ + PCRE2_SPTR start_code; /* The start of the compiled code */ + PCRE2_SPTR start_pattern; /* The start of the pattern */ + PCRE2_SPTR end_pattern; /* The end of the pattern */ + PCRE2_UCHAR *name_table; /* The name/number table */ + PCRE2_SIZE workspace_size; /* Size of workspace */ + PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ + PCRE2_SIZE erroroffset; /* Offset of error in pattern */ + uint16_t names_found; /* Number of entries so far */ + uint16_t name_entry_size; /* Size of each entry */ + uint16_t parens_depth; /* Depth of nested parentheses */ + uint16_t assert_depth; /* Depth of nested assertions */ + open_capitem *open_caps; /* Chain of open capture items */ + named_group *named_groups; /* Points to vector in pre-compile */ + uint32_t named_group_list_size; /* Number of entries in the list */ + uint32_t external_options; /* External (initial) options */ + uint32_t external_flags; /* External flag bits to be set */ + uint32_t bracount; /* Count of capturing parentheses */ + uint32_t lastcapture; /* Last capture encountered */ + uint32_t *parsed_pattern; /* Parsed pattern buffer */ + uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ + uint32_t *groupinfo; /* Group info vector */ + uint32_t top_backref; /* Maximum back reference */ + uint32_t backref_map; /* Bitmap of low back refs */ + uint32_t nltype; /* Newline type */ + uint32_t nllen; /* Newline string length */ + uint32_t class_range_start; /* Overall class range start */ + uint32_t class_range_end; /* Overall class range end */ + PCRE2_UCHAR nl[4]; /* Newline string 
when fixed length */ + uint32_t req_varyopt; /* "After variable item" flag for reqbyte */ + int max_lookbehind; /* Maximum lookbehind (characters) */ + BOOL had_accept; /* (*ACCEPT) encountered */ + BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ + BOOL had_recurse; /* Had a recursion or subroutine call */ + BOOL dupnames; /* Duplicate names exist */ +} compile_block; + +/* Structure for keeping the properties of the in-memory stack used +by the JIT matcher. */ + +typedef struct pcre2_real_jit_stack { + pcre2_memctl memctl; + void* stack; +} pcre2_real_jit_stack; + +/* Structure for items in a linked list that represents an explicit recursive +call within the pattern when running pcre2_dfa_match(). */ + +typedef struct dfa_recursion_info { + struct dfa_recursion_info *prevrec; + PCRE2_SPTR subject_position; + uint32_t group_num; +} dfa_recursion_info; + +/* Structure for "stack" frames that are used for remembering backtracking +positions during matching. As these are used in a vector, with the ovector item +being extended, the size of the structure must be a multiple of PCRE2_SIZE. The +only way to check this at compile time is to force an error by generating an +array with a negative size. By putting this in a typedef (which is never used), +we don't generate any code when all is well. */ + +typedef struct heapframe { + + /* The first set of fields are variables that have to be preserved over calls + to RRMATCH(), but which do not need to be copied to new frames. 
*/ + + PCRE2_SPTR ecode; /* The current position in the pattern */ + PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE2_SPTR values */ + PCRE2_SIZE length; /* Used for character, string, or code lengths */ + PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */ + PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */ + uint32_t rdepth; /* "Recursion" depth */ + uint32_t group_frame_type; /* Type information for group frames */ + uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */ + uint8_t return_id; /* Where to go on in internal "return" */ + uint8_t op; /* Processing opcode */ + + /* At this point, the structure is 16-bit aligned. On most architectures + the alignment requirement for a pointer will ensure that the eptr field below + is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer + that is 16-bit aligned. We must therefore ensure that what comes between here + and eptr is an odd multiple of 16 bits so as to get back into 32-bit + alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs + fudges in the other cases. In the 32-bit case the padding comes first so that + the occu field itself is 32-bit aligned. Without the padding, this structure + is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + PCRE2_UCHAR occu[6]; /* Used for other case code units */ +#elif PCRE2_CODE_UNIT_WIDTH == 16 + PCRE2_UCHAR occu[2]; /* Used for other case code units */ + uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ +#else + uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ + PCRE2_UCHAR occu[1]; /* Used for other case code units */ +#endif + + /* The rest have to be copied from the previous frame whenever a new frame + becomes current. The final field is specified as a large vector so that + runtime array bound checks don't catch references to it.
However, for any + specific call to pcre2_match() the memory allocated for each frame structure + allows for exactly the right size ovector for the number of capturing + parentheses. (See also the comment for pcre2_real_match_data above.) */ + + PCRE2_SPTR eptr; /* MUST BE FIRST */ + PCRE2_SPTR start_match; /* Can be adjusted by \K */ + PCRE2_SPTR mark; /* Most recent mark on the success path */ + uint32_t current_recurse; /* Current (deepest) recursion number */ + uint32_t capture_last; /* Most recent capture */ + PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */ + PCRE2_SIZE offset_top; /* Offset after highest capture */ + PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ +} heapframe; + +/* This typedef is a check that the size of the heapframe structure is a +multiple of PCRE2_SIZE. See various comments above. */ + +typedef char check_heapframe_size[ + ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; + +/* Structure for computing the alignment of heapframe. */ + +typedef struct heapframe_align { + char unalign; /* Completely unalign the current offset */ + heapframe frame; /* Offset is its alignment */ +} heapframe_align; + +/* This define is the minimum alignment required for a heapframe, in bytes. */ + +#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame) + +/* Structure for passing "static" information around between the functions +doing traditional NFA matching (pcre2_match() and friends). 
*/ + +typedef struct match_block { + pcre2_memctl memctl; /* For general use */ + uint32_t heap_limit; /* As it says */ + uint32_t match_limit; /* As it says */ + uint32_t match_limit_depth; /* As it says */ + uint32_t match_call_count; /* Number of times a new frame is created */ + BOOL hitend; /* Hit the end of the subject at some point */ + BOOL hasthen; /* Pattern contains (*THEN) */ + BOOL allowemptypartial; /* Allow empty hard partial */ + const uint8_t *lcc; /* Points to lower casing table */ + const uint8_t *fcc; /* Points to case-flipping table */ + const uint8_t *ctypes; /* Points to table of type maps */ + PCRE2_SIZE start_offset; /* The start offset value */ + PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ + uint16_t partial; /* PARTIAL options */ + uint16_t bsr_convention; /* \R interpretation */ + uint16_t name_count; /* Number of names in name table */ + uint16_t name_entry_size; /* Size of entry in names table */ + PCRE2_SPTR name_table; /* Table of group names */ + PCRE2_SPTR start_code; /* For use when recursing */ + PCRE2_SPTR start_subject; /* Start of the subject string */ + PCRE2_SPTR check_subject; /* Where UTF-checked from */ + PCRE2_SPTR end_subject; /* End of the subject string */ + PCRE2_SPTR end_match_ptr; /* Subject position at end match */ + PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ + PCRE2_SPTR last_used_ptr; /* Latest consulted character */ + PCRE2_SPTR mark; /* Mark pointer to pass back on success */ + PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */ + PCRE2_SPTR verb_ecode_ptr; /* For passing back info */ + PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */ + uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */ + uint32_t moptions; /* Match options */ + uint32_t poptions; /* Pattern options */ + uint32_t skip_arg_count; /* For counting SKIP_ARGs */ + uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */ + uint32_t nltype; /* 
Newline type */ + uint32_t nllen; /* Newline string length */ + PCRE2_UCHAR nl[4]; /* Newline string when fixed */ + pcre2_callout_block *cb; /* Points to a callout block */ + void *callout_data; /* To pass back to callouts */ + int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ +} match_block; + +/* A similar structure is used for the same purpose by the DFA matching +functions. */ + +typedef struct dfa_match_block { + pcre2_memctl memctl; /* For general use */ + PCRE2_SPTR start_code; /* Start of the compiled pattern */ + PCRE2_SPTR start_subject ; /* Start of the subject string */ + PCRE2_SPTR end_subject; /* End of subject string */ + PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ + PCRE2_SPTR last_used_ptr; /* Latest consulted character */ + const uint8_t *tables; /* Character tables */ + PCRE2_SIZE start_offset; /* The start offset value */ + uint32_t heap_limit; /* As it says */ + PCRE2_SIZE heap_used; /* As it says */ + uint32_t match_limit; /* As it says */ + uint32_t match_limit_depth; /* As it says */ + uint32_t match_call_count; /* Number of calls of internal function */ + uint32_t moptions; /* Match options */ + uint32_t poptions; /* Pattern options */ + uint32_t nltype; /* Newline type */ + uint32_t nllen; /* Newline string length */ + BOOL allowemptypartial; /* Allow empty hard partial */ + PCRE2_UCHAR nl[4]; /* Newline string when fixed */ + uint16_t bsr_convention; /* \R interpretation */ + pcre2_callout_block *cb; /* Points to a callout block */ + void *callout_data; /* To pass back to callouts */ + int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ + dfa_recursion_info *recursive; /* Linked list of recursion data */ +} dfa_match_block; + +#endif /* PCRE2_PCRE2TEST */ + +/* End of pcre2_intmodedep.h */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_maketables.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_maketables.c new file mode 100644 index 0000000000..246f0a646e 
--- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_maketables.c @@ -0,0 +1,163 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains the external function pcre2_maketables(), which builds +character tables for PCRE2 in the current locale. The file is compiled on its +own as part of the PCRE2 library. It is also included in the compilation of +pcre2_dftables.c as a freestanding program, in which case the macro +PCRE2_DFTABLES is defined. */ + +#ifndef PCRE2_DFTABLES /* Compiling the library */ +# ifdef HAVE_CONFIG_H +# include "regexp/pcre2/config.h" +# endif +# include "regexp/pcre2/pcre2_internal.h" +#endif + + + +/************************************************* +* Create PCRE2 character tables * +*************************************************/ + +/* This function builds a set of character tables for use by PCRE2 and returns +a pointer to them. They are built using the ctype functions, and consequently +their contents will depend upon the current locale setting. When compiled as +part of the library, the store is obtained via a general context malloc, if +supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables +freestanding auxiliary program) malloc() is used, and the function has a +different name so as not to clash with the prototype in pcre2.h. 
+ +Arguments: none when PCRE2_DFTABLES is defined + else a PCRE2 general context or NULL +Returns: pointer to the contiguous block of data + else NULL if memory allocation failed +*/ + +#ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */ +static const uint8_t *maketables(void) +{ +uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH); + +#else /* Not PCRE2_DFTABLES, that is, compiling the library */ +PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION +pcre2_maketables(pcre2_general_context *gcontext) +{ +uint8_t *yield = (uint8_t *)((gcontext != NULL)? + gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) : + malloc(TABLES_LENGTH)); +#endif /* PCRE2_DFTABLES */ + +int i; +uint8_t *p; + +if (yield == NULL) return NULL; +p = yield; + +/* First comes the lower casing table */ + +for (i = 0; i < 256; i++) *p++ = tolower(i); + +/* Next the case-flipping table */ + +for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); + +/* Then the character class tables. Don't try to be clever and save effort on +exclusive ones - in some locales things may be different. + +Note that the table for "space" includes everything "isspace" gives, including +VT in the default locale. This makes it work for the POSIX class [:space:]. +From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl +space, because Perl added VT at release 5.18. + +Note also that it is possible for a character to be alnum or alpha without +being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the +fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must +test for alnum specially. 
*/ + +memset(p, 0, cbit_length); +for (i = 0; i < 256; i++) + { + if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7); + if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7); + if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7); + if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7); + if (i == '_') p[cbit_word + i/8] |= 1u << (i&7); + if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7); + if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7); + if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7); + if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7); + if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7); + if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7); + } +p += cbit_length; + +/* Finally, the character type table. In this, we used to exclude VT from the +white space chars, because Perl didn't recognize it as such for \s and for +comments within regexes. However, Perl changed at release 5.18, so PCRE1 +changed at release 8.34 and it's always been this way for PCRE2. */ + +for (i = 0; i < 256; i++) + { + int x = 0; + if (isspace(i)) x += ctype_space; + if (isalpha(i)) x += ctype_letter; + if (islower(i)) x += ctype_lcletter; + if (isdigit(i)) x += ctype_digit; + if (isalnum(i) || i == '_') x += ctype_word; + *p++ = x; + } + +return yield; +} + +#ifndef PCRE2_DFTABLES /* Compiling the library */ +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables) +{ + if (gcontext) + gcontext->memctl.free((void *)tables, gcontext->memctl.memory_data); + else + free((void *)tables); +} +#endif + +/* End of pcre2_maketables.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match.c new file mode 100644 index 0000000000..9ba0ae8229 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match.c @@ -0,0 +1,7560 @@ +/************************************************* +* Perl-Compatible Regular Expressions * 
+*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2015-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +/* These defines enable debugging code */ + +/* #define DEBUG_FRAMES_DISPLAY */ +/* #define DEBUG_SHOW_OPS */ +/* #define DEBUG_SHOW_RMATCH */ + +#ifdef DEBUG_FRAMES_DISPLAY +#include <stdarg.h> +#endif + +/* These defines identify the name of the block containing "static" +information, and fields within it. */ + +#define NLBLOCK mb /* Block containing newline information */ +#define PSSTART start_subject /* Field containing processed string start */ +#define PSEND end_subject /* Field containing processed string end */ + +#include "regexp/pcre2/pcre2_internal.h" + +#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ + +/* Masks for identifying the public options that are permitted at match time. */ + +#define PUBLIC_MATCH_OPTIONS \ + (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ + PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ + PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT) + +#define PUBLIC_JIT_MATCH_OPTIONS \ + (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ + PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ + PCRE2_COPY_MATCHED_SUBJECT) + +/* Non-error returns from and within the match() function. Error returns are +externally defined PCRE2_ERROR_xxx codes, which are all negative. */ + +#define MATCH_MATCH 1 +#define MATCH_NOMATCH 0 + +/* Special internal returns used in the match() function. Make them +sufficiently negative to avoid the external error codes. */ + +#define MATCH_ACCEPT (-999) +#define MATCH_KETRPOS (-998) +/* The next 5 must be kept together and in sequence so that a test that checks +for any one of them can use a range. 
*/ +#define MATCH_COMMIT (-997) +#define MATCH_PRUNE (-996) +#define MATCH_SKIP (-995) +#define MATCH_SKIP_ARG (-994) +#define MATCH_THEN (-993) +#define MATCH_BACKTRACK_MAX MATCH_THEN +#define MATCH_BACKTRACK_MIN MATCH_COMMIT + +/* Group frame type values. Zero means the frame is not a group frame. The +lower 16 bits are used for data (e.g. the capture number). Group frames are +used for most groups so that information about the start is easily available at +the end without having to scan back through intermediate frames (backtrack +points). */ + +#define GF_CAPTURE 0x00010000u +#define GF_NOCAPTURE 0x00020000u +#define GF_CONDASSERT 0x00030000u +#define GF_RECURSE 0x00040000u + +/* Masks for the identity and data parts of the group frame type. */ + +#define GF_IDMASK(a) ((a) & 0xffff0000u) +#define GF_DATAMASK(a) ((a) & 0x0000ffffu) + +/* Repetition types */ + +enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS }; + +/* Min and max values for the common repeats; a maximum of UINT32_MAX => +infinity. */ + +static const uint32_t rep_min[] = { + 0, 0, /* * and *? */ + 1, 1, /* + and +? */ + 0, 0, /* ? and ?? */ + 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ + 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ + +static const uint32_t rep_max[] = { + UINT32_MAX, UINT32_MAX, /* * and *? */ + UINT32_MAX, UINT32_MAX, /* + and +? */ + 1, 1, /* ? and ?? */ + 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ + UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ + +/* Repetition types - must include OP_CRPOSRANGE (not needed above) */ + +static const uint32_t rep_typ[] = { + REPTYPE_MAX, REPTYPE_MIN, /* * and *? */ + REPTYPE_MAX, REPTYPE_MIN, /* + and +? */ + REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */ + REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */ + REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */ + REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ + +/* Numbers for RMATCH calls at backtracking points. 
When these lists are +changed, the code at RETURN_SWITCH below must be updated in sync. */ + +enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, + RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, + RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, + RM31, RM32, RM33, RM34, RM35, RM36 }; + +#ifdef SUPPORT_WIDE_CHARS +enum { RM100=100, RM101 }; +#endif + +#ifdef SUPPORT_UNICODE +enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, + RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, + RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223, + RM224, RM225 }; +#endif + +/* Define short names for general fields in the current backtrack frame, which +is always pointed to by the F variable. Occasional references to fields in +other frames are written out explicitly. There are also some fields in the +current frame whose names start with "temp" that are used for short-term, +localised backtracking memory. These are #defined with Lxxx names at the point +of use and undefined afterwards. */ + +#define Fback_frame F->back_frame +#define Fcapture_last F->capture_last +#define Fcurrent_recurse F->current_recurse +#define Fecode F->ecode +#define Feptr F->eptr +#define Fgroup_frame_type F->group_frame_type +#define Flast_group_offset F->last_group_offset +#define Flength F->length +#define Fmark F->mark +#define Frdepth F->rdepth +#define Fstart_match F->start_match +#define Foffset_top F->offset_top +#define Foccu F->occu +#define Fop F->op +#define Fovector F->ovector +#define Freturn_id F->return_id + + +#ifdef DEBUG_FRAMES_DISPLAY +/************************************************* +* Display current frames and contents * +*************************************************/ + +/* This debugging function displays the current set of frames and their +contents. It is not called automatically from anywhere, the intention being +that calls can be inserted where necessary when debugging frame-related +problems. 
+ +Arguments: + f the file to write to + F the current top frame + P a previous frame of interest + frame_size the frame size + mb points to the match block + match_data points to the match data block + s identification text + +Returns: nothing +*/ + +static void +display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, + match_block *mb, pcre2_match_data *match_data, const char *s, ...) +{ +uint32_t i; +heapframe *Q; +va_list ap; +va_start(ap, s); + +fprintf(f, "FRAMES "); +vfprintf(f, s, ap); +va_end(ap); + +if (P != NULL) fprintf(f, " P=%lu", + ((char *)P - (char *)(match_data->heapframes))/frame_size); +fprintf(f, "\n"); + +for (i = 0, Q = match_data->heapframes; + Q <= F; + i++, Q = (heapframe *)((char *)Q + frame_size)) + { + fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", + i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), + Q->back_frame, Q->return_id); + + if (Q->last_group_offset == PCRE2_UNSET) + fprintf(f, " lgoffset=unset\n"); + else + fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); + } +} + +#endif + + + +/************************************************* +* Process a callout * +*************************************************/ + +/* This function is called for all callouts, whether "standalone" or at the +start of a conditional group. Feptr will be pointing to either OP_CALLOUT or +OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized +with fixed values. + +Arguments: + F points to the current backtracking frame + mb points to the match block + lengthptr where to return the length of the callout item + +Returns: the return from the callout + or 0 if no callout function exists +*/ + +static int +do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) +{ +int rc; +PCRE2_SIZE save0, save1; +PCRE2_SIZE *callout_ovector; +pcre2_callout_block *cb; + +*lengthptr = (*Fecode == OP_CALLOUT)? 
+ PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); + +if (mb->callout == NULL) return 0; /* No callout function provided */ + +/* The original matching code (pre 10.30) worked directly with the ovector +passed by the user, and this was passed to callouts. Now that the working +ovector is in the backtracking frame, it no longer needs to reserve space for +the overall match offsets (which would waste space in the frame). For backward +compatibility, however, we pass capture_top and offset_vector to the callout as +if for the extended ovector, and we ensure that the first two slots are unset +by preserving and restoring their current contents. Picky compilers complain if +references such as Fovector[-2] are used directly, so we set up a separate +pointer. */ + +callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; + +/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields +are set externally. The first 3 never change; the last is updated for each +bumpalong. */ + +cb = mb->cb; +cb->capture_top = (uint32_t)Foffset_top/2 + 1; +cb->capture_last = Fcapture_last; +cb->offset_vector = callout_ovector; +cb->mark = mb->nomatch_mark; +cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); +cb->pattern_position = GET(Fecode, 1); +cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); + +if (*Fecode == OP_CALLOUT) /* Numerical callout */ + { + cb->callout_number = Fecode[1 + 2*LINK_SIZE]; + cb->callout_string_offset = 0; + cb->callout_string = NULL; + cb->callout_string_length = 0; + } +else /* String callout */ + { + cb->callout_number = 0; + cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); + cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; + cb->callout_string_length = + *lengthptr - (1 + 4*LINK_SIZE) - 2; + } + +save0 = callout_ovector[0]; +save1 = callout_ovector[1]; +callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; +rc = mb->callout(cb, mb->callout_data); +callout_ovector[0] = save0; +callout_ovector[1] = save1; 
+cb->callout_flags = 0; +return rc; +} + + + +/************************************************* +* Match a back-reference * +*************************************************/ + +/* This function is called only when it is known that the offset lies within +the offsets that have so far been used in the match. Note that in caseless +UTF-8 mode, the number of subject bytes matched may be different to the number +of reference bytes. (In theory this could also happen in UTF-16 mode, but it +seems unlikely.) + +Arguments: + offset index into the offset vector + caseless TRUE if caseless + F the current backtracking frame pointer + mb points to match block + lengthptr pointer for returning the length matched + +Returns: = 0 successful match; number of code units matched is set + < 0 no match + > 0 partial match +*/ + +static int +match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, + PCRE2_SIZE *lengthptr) +{ +PCRE2_SPTR p; +PCRE2_SIZE length; +PCRE2_SPTR eptr; +PCRE2_SPTR eptr_start; + +/* Deal with an unset group. The default is no match, but there is an option to +match an empty string. */ + +if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) + { + if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) + { + *lengthptr = 0; + return 0; /* Match */ + } + else return -1; /* No match */ + } + +/* Separate the caseless and UTF cases for speed. */ + +eptr = eptr_start = Feptr; +p = mb->start_subject + Fovector[offset]; +length = Fovector[offset+1] - Fovector[offset]; + +if (caseless) + { +#if defined SUPPORT_UNICODE + BOOL utf = (mb->poptions & PCRE2_UTF) != 0; + + if (utf || (mb->poptions & PCRE2_UCP) != 0) + { + PCRE2_SPTR endptr = p + length; + + /* Match characters up to the end of the reference. NOTE: the number of + code units matched may differ, because in UTF-8 there are some characters + whose upper and lower case codes have different numbers of bytes. 
For + example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 + bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a + sequence of two of the latter. It is important, therefore, to check the + length along the reference, not along the subject (earlier code did this + wrong). UCP without UTF uses Unicode properties but not UTF encoding. */ + + while (p < endptr) + { + uint32_t c, d; + const ucd_record *ur; + if (eptr >= mb->end_subject) return 1; /* Partial match */ + + if (utf) + { + GETCHARINC(c, eptr); + GETCHARINC(d, p); + } + else + { + c = *eptr++; + d = *p++; + } + + ur = GET_UCD(d); + if (c != d && c != (uint32_t)((int)d + ur->other_case)) + { + const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; + for (;;) + { + if (c < *pp) return -1; /* No match */ + if (c == *pp++) break; + } + } + } + } + else +#endif + + /* Not in UTF or UCP mode */ + { + for (; length > 0; length--) + { + uint32_t cc, cp; + if (eptr >= mb->end_subject) return 1; /* Partial match */ + cc = UCHAR21TEST(eptr); + cp = UCHAR21TEST(p); + if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) + return -1; /* No match */ + p++; + eptr++; + } + } + } + +/* In the caseful case, we can just compare the code units, whether or not we +are in UTF and/or UCP mode. When partial matching, we have to do this unit by +unit. 
*/ + +else + { + if (mb->partial != 0) + { + for (; length > 0; length--) + { + if (eptr >= mb->end_subject) return 1; /* Partial match */ + if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ + } + } + + /* Not partial matching */ + + else + { + if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ + if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ + eptr += length; + } + } + +*lengthptr = eptr - eptr_start; +return 0; /* Match */ +} + + + +/****************************************************************************** +******************************************************************************* + "Recursion" in the match() function + +The original match() function was highly recursive, but this proved to be the +source of a number of problems over the years, mostly because of the relatively +small system stacks that are commonly found. As new features were added to +patterns, various kludges were invented to reduce the amount of stack used, +making the code hard to understand in places. + +A version did exist that used individual frames on the heap instead of calling +match() recursively, but this ran substantially slower. The current version is +a refactoring that uses a vector of frames to remember backtracking points. +This runs no slower, and possibly even a bit faster than the original recursive +implementation. + +At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50 +frames) was allocated on the system stack. If this was not big enough, the heap +was used for a larger vector. However, it turns out that there are environments +where taking as little as 20KiB from the system stack is an embarrassment. 
+After another refactoring, the heap is used exclusively, but a pointer to the +frames vector and its size are cached in the match_data block, so that there is +no new memory allocation if the same match_data block is used for multiple +matches (unless the frames vector has to be extended). +******************************************************************************* +******************************************************************************/ + + + + +/************************************************* +* Macros for the match() function * +*************************************************/ + +/* These macros pack up tests that are used for partial matching several times +in the code. The second one is used when we already know we are past the end of +the subject. We set the "hit end" flag if the pointer is at the end of the +subject and either (a) the pointer is past the earliest inspected character +(i.e. something has been matched, even if not part of the actual matched +string), or (b) the pattern contains a lookbehind. These are the conditions for +which adding more characters may allow the current match to continue. + +For hard partial matching, we immediately return a partial match. Otherwise, +carrying on means that a complete match on the current subject will be sought. +A partial match is returned only if no complete match can be found. */ + +#define CHECK_PARTIAL()\ + if (Feptr >= mb->end_subject) \ + { \ + SCHECK_PARTIAL(); \ + } + +#define SCHECK_PARTIAL()\ + if (mb->partial != 0 && \ + (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \ + { \ + mb->hitend = TRUE; \ + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ + } + + +/* These macros are used to implement backtracking. They simulate a recursive +call to the match() function by means of a local vector of frames which +remember the backtracking points. 
*/ + +#define RMATCH(ra,rb)\ + {\ + start_ecode = ra;\ + Freturn_id = rb;\ + goto MATCH_RECURSE;\ + L_##rb:;\ + } + +#define RRETURN(ra)\ + {\ + rrc = ra;\ + goto RETURN_SWITCH;\ + } + + + +/************************************************* +* Match from current position * +*************************************************/ + +/* This function is called to run one match attempt at a single starting point +in the subject. + +Performance note: It might be tempting to extract commonly used fields from the +mb structure (e.g. end_subject) into individual variables to improve +performance. Tests using gcc on a SPARC disproved this; in the first case, it +made performance worse. + +Arguments: + start_eptr starting character in subject + start_ecode starting position in compiled code + top_bracket number of capturing parentheses in the pattern + frame_size size of each backtracking frame + match_data pointer to the match_data block + mb pointer to "static" variables block + +Returns: MATCH_MATCH if matched ) these values are >= 0 + MATCH_NOMATCH if failed to match ) + negative MATCH_xxx value for PRUNE, SKIP, etc + negative PCRE2_ERROR_xxx value if aborted by an error condition + (e.g. stopped by repeated call or depth limit) +*/ + +static int +match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, + PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb) +{ +/* Frame-handling variables */ + +heapframe *F; /* Current frame pointer */ +heapframe *N = NULL; /* Temporary frame pointers */ +heapframe *P = NULL; + +heapframe *frames_top; /* End of frames vector */ +heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ +PCRE2_SIZE heapframes_size; /* Usable size of frames vector */ +PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ + +/* Local variables that do not need to be preserved over calls to RRMATCH(). 
*/ + +PCRE2_SPTR bracode; /* Temp pointer to start of group */ +PCRE2_SIZE offset; /* Used for group offsets */ +PCRE2_SIZE length; /* Used for various length calculations */ + +int rrc; /* Return from functions & backtracking "recursions" */ +#ifdef SUPPORT_UNICODE +int proptype; /* Type of character property */ +#endif + +uint32_t i; /* Used for local loops */ +uint32_t fc; /* Character values */ +uint32_t number; /* Used for group and other numbers */ +uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ +uint32_t group_frame_type; /* Specifies type for new group frames */ + +BOOL condition; /* Used in conditional groups */ +BOOL cur_is_word; /* Used in "word" tests */ +BOOL prev_is_word; /* Used in "word" tests */ + +/* UTF and UCP flags */ + +#ifdef SUPPORT_UNICODE +BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; +#else +BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ +#endif + +/* This is the length of the last part of a backtracking frame that must be +copied when a new frame is created. */ + +frame_copy_size = frame_size - offsetof(heapframe, eptr); + +/* Set up the first frame and the end of the frames vector. We set the local +heapframes_size to the usable amount of the vector, that is, a whole number of +frames. */ + +F = match_data->heapframes; +heapframes_size = (match_data->heapframes_size / frame_size) * frame_size; +frames_top = (heapframe *)((char *)F + heapframes_size); + +Frdepth = 0; /* "Recursion" depth */ +Fcapture_last = 0; /* Number of most recent capture */ +Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. 
*/ +Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ +Fmark = NULL; /* Most recent mark */ +Foffset_top = 0; /* End of captures within the frame */ +Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ +group_frame_type = 0; /* Not a start of group frame */ +goto NEW_FRAME; /* Start processing with this frame */ + +/* Come back here when we want to create a new frame for remembering a +backtracking point. */ + +MATCH_RECURSE: + +/* Set up a new backtracking frame. If the vector is full, get a new one, +doubling the size, but constrained by the heap limit (which is in KiB). */ + +N = (heapframe *)((char *)F + frame_size); +if (N >= frames_top) + { + heapframe *new; + PCRE2_SIZE newsize; + + if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2) + { + if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1) + return PCRE2_ERROR_NOMEMORY; + newsize = PCRE2_SIZE_MAX - 1; + } + else + newsize = match_data->heapframes_size * 2; + + if (newsize / 1024 >= mb->heap_limit) + { + PCRE2_SIZE old_size = match_data->heapframes_size / 1024; + if (mb->heap_limit <= old_size) return PCRE2_ERROR_HEAPLIMIT; + else + { + PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size); + int over_bytes = match_data->heapframes_size % 1024; + if (over_bytes) max_delta -= (1024 - over_bytes); + newsize = match_data->heapframes_size + max_delta; + } + } + + new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data); + if (new == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy(new, match_data->heapframes, heapframes_size); + + F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes)); + N = (heapframe *)((char *)F + frame_size); + + match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data); + match_data->heapframes = new; + match_data->heapframes_size = newsize; + + heapframes_size = (newsize / frame_size) * frame_size; + frames_top = (heapframe *)((char *)new + heapframes_size); + } + +#ifdef 
DEBUG_SHOW_RMATCH +fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1); +if (group_frame_type != 0) + { + fprintf(stderr, " type=%x ", group_frame_type); + switch (GF_IDMASK(group_frame_type)) + { + case GF_CAPTURE: + fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); + break; + + case GF_NOCAPTURE: + fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); + break; + + case GF_CONDASSERT: + fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); + break; + + case GF_RECURSE: + fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); + break; + + default: + fprintf(stderr, "*** unknown ***"); + break; + } + } +fprintf(stderr, "\n"); +#endif + +/* Copy those fields that must be copied into the new frame, increase the +"recursion" depth (i.e. the new frame's index) and then make the new frame +current. */ + +memcpy((char *)N + offsetof(heapframe, eptr), + (char *)F + offsetof(heapframe, eptr), + frame_copy_size); + +N->rdepth = Frdepth + 1; +F = N; + +/* Carry on processing with a new frame. */ + +NEW_FRAME: +Fgroup_frame_type = group_frame_type; +Fecode = start_ecode; /* Starting code pointer */ +Fback_frame = frame_size; /* Default is go back one frame */ + +/* If this is a special type of group frame, remember its offset for quick +access at the end of the group. If this is a recursion, set a new current +recursion value. */ + +if (group_frame_type != 0) + { + Flast_group_offset = (char *)F - (char *)match_data->heapframes; + if (GF_IDMASK(group_frame_type) == GF_RECURSE) + Fcurrent_recurse = GF_DATAMASK(group_frame_type); + group_frame_type = 0; + } + + +/* ========================================================================= */ +/* This is the main processing loop. First check that we haven't recorded too +many backtracks (search tree is too large), or that we haven't exceeded the +recursive depth limit (used too many backtracking frames). If not, process the +opcodes. 
*/ + +if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; +if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; + +for (;;) + { +#ifdef DEBUG_SHOW_OPS +fprintf(stderr, "++ op=%d\n", *Fecode); +#endif + + Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ + switch(Fop) + { + /* ===================================================================== */ + /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close + any currently open capturing brackets. Unlike reaching the end of a group, + where we know the starting frame is at the top of the chained frames, in + this case we have to search back for the relevant frame in case other types + of group that use chained frames have intervened. Multiple OP_CLOSEs always + come innermost first, which matches the chain order. We can ignore this in + a recursion, because captures are not passed out of recursions. */ + + case OP_CLOSE: + if (Fcurrent_recurse == RECURSE_UNSET) + { + number = GET2(Fecode, 1); + offset = Flast_group_offset; + for(;;) + { + if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; + N = (heapframe *)((char *)match_data->heapframes + offset); + P = (heapframe *)((char *)N - frame_size); + if (N->group_frame_type == (GF_CAPTURE | number)) break; + offset = P->last_group_offset; + } + offset = (number << 1) - 2; + Fcapture_last = number; + Fovector[offset] = P->eptr - mb->start_subject; + Fovector[offset+1] = Feptr - mb->start_subject; + if (offset >= Foffset_top) Foffset_top = offset + 2; + } + Fecode += PRIV(OP_lengths)[*Fecode]; + break; + + + /* ===================================================================== */ + /* Real or forced end of the pattern, assertion, or recursion. In an + assertion ACCEPT, update the last used pointer and remember the current + frame so that the captures and mark can be fished out of it. 
*/ + + case OP_ASSERT_ACCEPT: + if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; + assert_accept_frame = F; + RRETURN(MATCH_ACCEPT); + + /* If recursing, we have to find the most recent recursion. */ + + case OP_ACCEPT: + case OP_END: + + /* Handle end of a recursion. */ + + if (Fcurrent_recurse != RECURSE_UNSET) + { + offset = Flast_group_offset; + for(;;) + { + if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; + N = (heapframe *)((char *)match_data->heapframes + offset); + P = (heapframe *)((char *)N - frame_size); + if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; + offset = P->last_group_offset; + } + + /* N is now the frame of the recursion; the previous frame is at the + OP_RECURSE position. Go back there, copying the current subject position + and mark, and the start_match position (\K might have changed it), and + then move on past the OP_RECURSE. */ + + P->eptr = Feptr; + P->mark = Fmark; + P->start_match = Fstart_match; + F = P; + Fecode += 1 + LINK_SIZE; + continue; + } + + /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY + is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the + start of the subject. In both cases, backtracking will then try other + alternatives, if any. */ + + if (Feptr == Fstart_match && + ((mb->moptions & PCRE2_NOTEMPTY) != 0 || + ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && + Fstart_match == mb->start_subject + mb->start_offset))) + RRETURN(MATCH_NOMATCH); + + /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not + the end of the subject. After (*ACCEPT) we fail the entire match (at this + position) but backtrack on reaching the end of the pattern. */ + + if (Feptr < mb->end_subject && + ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) + { + if (Fop == OP_END) RRETURN(MATCH_NOMATCH); + return MATCH_NOMATCH; + } + + /* We have a successful match of the whole pattern. Record the result and + then do a direct return from the function. 
If there is space in the offset + vector, set any pairs that follow the highest-numbered captured string but + are less than the number of capturing groups in the pattern to PCRE2_UNSET. + It is documented that this happens. "Gaps" are set to PCRE2_UNSET + dynamically. It is only those at the end that need setting here. */ + + mb->end_match_ptr = Feptr; /* Record where we ended */ + mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ + mb->mark = Fmark; /* and the last success mark */ + if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; + + match_data->ovector[0] = Fstart_match - mb->start_subject; + match_data->ovector[1] = Feptr - mb->start_subject; + + /* Set i to the smaller of the sizes of the external and frame ovectors. */ + + i = 2 * ((top_bracket + 1 > match_data->oveccount)? + match_data->oveccount : top_bracket + 1); + memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); + while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET; + return MATCH_MATCH; /* Note: NOT RRETURN */ + + + /*===================================================================== */ + /* Match any single character type except newline; have to take care with + CRLF newlines and partial matching. */ + + case OP_ANY: + if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); + if (mb->partial != 0 && + Feptr == mb->end_subject - 1 && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + /* Fall through */ + + /* Match any single character whatsoever. */ + + case OP_ALLANY: + if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ + { /* not be updated before SCHECK_PARTIAL. 
*/ + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr++; +#ifdef SUPPORT_UNICODE + if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); +#endif + Fecode++; + break; + + + /* ===================================================================== */ + /* Match a single code unit, even in UTF mode. This opcode really does + match any code unit, even newline. (It really should be called ANYCODEUNIT, + of course - the byte name is from pre-16 bit days.) */ + + case OP_ANYBYTE: + if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ + { /* not be updated before SCHECK_PARTIAL. */ + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr++; + Fecode++; + break; + + + /* ===================================================================== */ + /* Match a single character, casefully */ + + case OP_CHAR: +#ifdef SUPPORT_UNICODE + if (utf) + { + Flength = 1; + Fecode++; + GETCHARLEN(fc, Fecode, Flength); + if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) + { + CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } + for (; Flength > 0; Flength--) + { + if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + + /* Not UTF mode */ + { + if (mb->end_subject - Feptr < 1) + { + SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ + RRETURN(MATCH_NOMATCH); + } + if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); + Fecode += 2; + } + break; + + + /* ===================================================================== */ + /* Match a single character, caselessly. If we are at the end of the + subject, give up immediately. We get here only when the pattern character + has at most one other case. Characters with more than two cases are coded + as OP_PROP with the pseudo-property PT_CLIST. 
*/ + + case OP_CHARI: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + +#ifdef SUPPORT_UNICODE + if (utf) + { + Flength = 1; + Fecode++; + GETCHARLEN(fc, Fecode, Flength); + + /* If the pattern character's value is < 128, we know that its other case + (if any) is also < 128 (and therefore only one code unit long in all + code-unit widths), so we can use the fast lookup table. We checked above + that there is at least one character left in the subject. */ + + if (fc < 128) + { + uint32_t cc = UCHAR21(Feptr); + if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); + Fecode++; + Feptr++; + } + + /* Otherwise we must pick up the subject character and use Unicode + property support to test its other case. Note that we cannot use the + value of "Flength" to check for sufficient bytes left, because the other + case of the character may have more or fewer code units. */ + + else + { + uint32_t dc; + GETCHARINC(dc, Feptr); + Fecode += Flength; + if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); + } + } + + /* If UCP is set without UTF we must do the same as above, but with one + character per code unit. */ + + else if (ucp) + { + uint32_t cc = UCHAR21(Feptr); + fc = Fecode[1]; + if (fc < 128) + { + if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); + } + else + { + if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); + } + Feptr++; + Fecode += 2; + } + + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF or UCP mode; use the table for characters < 256. */ + { + if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) + != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); + Feptr++; + Fecode += 2; + } + break; + + + /* ===================================================================== */ + /* Match not a single character. 
*/ + + case OP_NOT: + case OP_NOTI: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t ch; + Fecode++; + GETCHARINC(ch, Fecode); + GETCHARINC(fc, Feptr); + if (ch == fc) + { + RRETURN(MATCH_NOMATCH); /* Caseful match */ + } + else if (Fop == OP_NOTI) /* If caseless */ + { + if (ch > 127) + ch = UCD_OTHERCASE(ch); + else + ch = (mb->fcc)[ch]; + if (ch == fc) RRETURN(MATCH_NOMATCH); + } + } + + /* UCP without UTF is as above, but with one character per code unit. */ + + else if (ucp) + { + uint32_t ch; + fc = UCHAR21INC(Feptr); + ch = Fecode[1]; + Fecode += 2; + + if (ch == fc) + { + RRETURN(MATCH_NOMATCH); /* Caseful match */ + } + else if (Fop == OP_NOTI) /* If caseless */ + { + if (ch > 127) + ch = UCD_OTHERCASE(ch); + else + ch = (mb->fcc)[ch]; + if (ch == fc) RRETURN(MATCH_NOMATCH); + } + } + + else +#endif /* SUPPORT_UNICODE */ + + /* Neither UTF nor UCP is set */ + + { + uint32_t ch = Fecode[1]; + fc = UCHAR21INC(Feptr); + if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) + RRETURN(MATCH_NOMATCH); + Fecode += 2; + } + break; + + + /* ===================================================================== */ + /* Match a single character repeatedly. 
*/ + +#define Loclength F->temp_size +#define Lstart_eptr F->temp_sptr[0] +#define Lcharptr F->temp_sptr[1] +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] +#define Lc F->temp_32[2] +#define Loc F->temp_32[3] + + case OP_EXACT: + case OP_EXACTI: + Lmin = Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATCHAR; + + case OP_POSUPTO: + case OP_POSUPTOI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATCHAR; + + case OP_UPTO: + case OP_UPTOI: + reptype = REPTYPE_MAX; + Lmin = 0; + Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATCHAR; + + case OP_MINUPTO: + case OP_MINUPTOI: + reptype = REPTYPE_MIN; + Lmin = 0; + Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATCHAR; + + case OP_POSSTAR: + case OP_POSSTARI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATCHAR; + + case OP_POSPLUS: + case OP_POSPLUSI: + reptype = REPTYPE_POS; + Lmin = 1; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATCHAR; + + case OP_POSQUERY: + case OP_POSQUERYI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = 1; + Fecode++; + goto REPEATCHAR; + + case OP_STAR: + case OP_STARI: + case OP_MINSTAR: + case OP_MINSTARI: + case OP_PLUS: + case OP_PLUSI: + case OP_MINPLUS: + case OP_MINPLUSI: + case OP_QUERY: + case OP_QUERYI: + case OP_MINQUERY: + case OP_MINQUERYI: + fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + + /* Common code for all repeated single-character matches. We first check + for the minimum number of characters. If the minimum equals the maximum, we + are done. Otherwise, if minimizing, check the rest of the pattern for a + match; if there isn't one, advance up to the maximum, one character at a + time. + + If maximizing, advance up to the maximum number of matching characters, + until Feptr is past the end of the maximum run. If possessive, we are + then done (no backing up). 
Otherwise, match at this position; anything + other than no match is immediately returned. For nomatch, back up one + character, unless we are matching \R and the last thing matched was + \r\n, in which case, back up two code units until we reach the first + optional character position. + + The various UTF/non-UTF and caseful/caseless cases are handled separately, + for speed. */ + + REPEATCHAR: +#ifdef SUPPORT_UNICODE + if (utf) + { + Flength = 1; + Lcharptr = Fecode; + GETCHARLEN(fc, Fecode, Flength); + Fecode += Flength; + + /* Handle multi-code-unit character matching, caseful and caseless. */ + + if (Flength > 1) + { + uint32_t othercase; + + if (Fop >= OP_STARI && /* Caseless */ + (othercase = UCD_OTHERCASE(fc)) != fc) + Loclength = PRIV(ord2utf)(othercase, Foccu); + else Loclength = 0; + + for (i = 1; i <= Lmin; i++) + { + if (Feptr <= mb->end_subject - Flength && + memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; + else if (Loclength > 0 && + Feptr <= mb->end_subject - Loclength && + memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) + Feptr += Loclength; + else + { + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + } + + if (Lmin == Lmax) continue; + + if (reptype == REPTYPE_MIN) + { + for (;;) + { + RMATCH(Fecode, RM202); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr <= mb->end_subject - Flength && + memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; + else if (Loclength > 0 && + Feptr <= mb->end_subject - Loclength && + memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) + Feptr += Loclength; + else + { + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + else /* Maximize */ + { + Lstart_eptr = Feptr; + for (i = Lmin; i < Lmax; i++) + { + if (Feptr <= mb->end_subject - Flength && + memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) + Feptr += Flength; + else if (Loclength > 0 && + Feptr <= mb->end_subject - Loclength && + memcmp(Feptr, 
Foccu, CU2BYTES(Loclength)) == 0) + Feptr += Loclength; + else + { + CHECK_PARTIAL(); + break; + } + } + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + + if (reptype != REPTYPE_POS) for(;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM203); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + BACKCHAR(Feptr); + } + } + break; /* End of repeated wide character handling */ + } + + /* Length of UTF character is 1. Put it into the preserved variable and + fall through to the non-UTF code. */ + + Lc = fc; + } + else +#endif /* SUPPORT_UNICODE */ + + /* When not in UTF mode, load a single-code-unit character. Then proceed as + above, using Unicode casing if either UTF or UCP is set. */ + + Lc = *Fecode++; + + /* Caseless comparison */ + + if (Fop >= OP_STARI) + { +#if PCRE2_CODE_UNIT_WIDTH == 8 +#ifdef SUPPORT_UNICODE + if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + /* Lc will be < 128 in UTF-8 mode. 
*/ + Loc = mb->fcc[Lc]; +#else /* 16-bit & 32-bit */ +#ifdef SUPPORT_UNICODE + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + Loc = TABLE_GET(Lc, mb->fcc, Lc); +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; /* Faster than PCRE2_UCHAR */ + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21TEST(Feptr); + if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); + Feptr++; + } + if (Lmin == Lmax) continue; + + if (reptype == REPTYPE_MIN) + { + for (;;) + { + uint32_t cc; /* Faster than PCRE2_UCHAR */ + RMATCH(Fecode, RM25); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21TEST(Feptr); + if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); + Feptr++; + } + /* Control never gets here */ + } + + else /* Maximize */ + { + Lstart_eptr = Feptr; + for (i = Lmin; i < Lmax; i++) + { + uint32_t cc; /* Faster than PCRE2_UCHAR */ + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + cc = UCHAR21TEST(Feptr); + if (Lc != cc && Loc != cc) break; + Feptr++; + } + if (reptype != REPTYPE_POS) for (;;) + { + if (Feptr == Lstart_eptr) break; + RMATCH(Fecode, RM26); + Feptr--; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + } + } + + /* Caseful comparisons (includes all multi-byte characters) */ + + else + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); + } + + if (Lmin == Lmax) continue; + + if (reptype == REPTYPE_MIN) + { + for (;;) + { + RMATCH(Fecode, RM27); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc != UCHAR21INCTEST(Feptr)) 
RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + else /* Maximize */ + { + Lstart_eptr = Feptr; + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + + if (Lc != UCHAR21TEST(Feptr)) break; + Feptr++; + } + + if (reptype != REPTYPE_POS) for (;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM28); + Feptr--; + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + } + } + break; + +#undef Loclength +#undef Lstart_eptr +#undef Lcharptr +#undef Lmin +#undef Lmax +#undef Lc +#undef Loc + + + /* ===================================================================== */ + /* Match a negated single one-byte character repeatedly. This is almost a + repeat of the code for a repeated single character, but I haven't found a + nice way of commoning these up that doesn't require a test of the + positive/negative option for each character match. Maybe that wouldn't add + very much to the time taken, but character matching *is* what this is all + about... 
*/ + +#define Lstart_eptr F->temp_sptr[0] +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] +#define Lc F->temp_32[2] +#define Loc F->temp_32[3] + + case OP_NOTEXACT: + case OP_NOTEXACTI: + Lmin = Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATNOTCHAR; + + case OP_NOTUPTO: + case OP_NOTUPTOI: + Lmin = 0; + Lmax = GET2(Fecode, 1); + reptype = REPTYPE_MAX; + Fecode += 1 + IMM2_SIZE; + goto REPEATNOTCHAR; + + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + Lmin = 0; + Lmax = GET2(Fecode, 1); + reptype = REPTYPE_MIN; + Fecode += 1 + IMM2_SIZE; + goto REPEATNOTCHAR; + + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + reptype = REPTYPE_POS; + Lmin = 1; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = 1; + Fecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATNOTCHAR; + + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + + /* Common code for all repeated single-character non-matches. */ + + REPEATNOTCHAR: + GETCHARINCTEST(Lc, Fecode); + + /* The code is duplicated for the caseless and caseful cases, for speed, + since matching characters is likely to be quite common. First, ensure the + minimum number of matches are present. If Lmin = Lmax, we are done. 
+ Otherwise, if minimizing, keep trying the rest of the expression and + advancing one matching character if failing, up to the maximum. + Alternatively, if maximizing, find the maximum number of characters and + work backwards. */ + + if (Fop >= OP_NOTSTARI) /* Caseless */ + { +#ifdef SUPPORT_UNICODE + if ((utf || ucp) && Lc > 127) + Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + + Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(d, Feptr); + if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF mode */ + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); + Feptr++; + } + } + + if (Lmin == Lmax) continue; /* Finished for exact count */ + + if (reptype == REPTYPE_MIN) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (;;) + { + RMATCH(Fecode, RM204); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(d, Feptr); + if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif /*SUPPORT_UNICODE */ + + /* Not UTF mode */ + { + for (;;) + { + RMATCH(Fecode, RM29); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); + Feptr++; + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else + { + Lstart_eptr = Feptr; + +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if 
(Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(d, Feptr, len); + if (Lc == d || Loc == d) break; + Feptr += len; + } + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + + if (reptype != REPTYPE_POS) for(;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM205); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + BACKCHAR(Feptr); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF mode */ + { + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (Lc == *Feptr || Loc == *Feptr) break; + Feptr++; + } + if (reptype != REPTYPE_POS) for (;;) + { + if (Feptr == Lstart_eptr) break; + RMATCH(Fecode, RM30); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + } + } + } + } + + /* Caseful comparisons */ + + else + { +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(d, Feptr); + if (Lc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF mode */ + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); + } + } + + if (Lmin == Lmax) continue; + + if (reptype == REPTYPE_MIN) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (;;) + { + RMATCH(Fecode, RM206); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(d, Feptr); + if (Lc == d) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF mode */ + { + for (;;) + { + RMATCH(Fecode, RM31); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + 
SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else + { + Lstart_eptr = Feptr; + +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t d; + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(d, Feptr, len); + if (Lc == d) break; + Feptr += len; + } + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + + if (reptype != REPTYPE_POS) for(;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM207); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + BACKCHAR(Feptr); + } + } + else +#endif + /* Not UTF mode */ + { + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (Lc == *Feptr) break; + Feptr++; + } + if (reptype != REPTYPE_POS) for (;;) + { + if (Feptr == Lstart_eptr) break; + RMATCH(Fecode, RM32); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + } + } + } + } + break; + +#undef Lstart_eptr +#undef Lmin +#undef Lmax +#undef Lc +#undef Loc + + + /* ===================================================================== */ + /* Match a bit-mapped character class, possibly repeatedly. These opcodes + are used when all the characters in the class have values in the range + 0-255, and either the matching is caseful, or the characters are in the + range 0-127 when UTF processing is enabled. The only difference between + OP_CLASS and OP_NCLASS occurs when a data character outside the range is + encountered. 
*/ + +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] +#define Lstart_eptr F->temp_sptr[0] +#define Lbyte_map_address F->temp_sptr[1] +#define Lbyte_map ((unsigned char *)Lbyte_map_address) + + case OP_NCLASS: + case OP_CLASS: + { + Lbyte_map_address = Fecode + 1; /* Save for matching */ + Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ + + /* Look past the end of the item to see if there is repeat information + following. Then obey similar code to character type repeats. */ + + switch (*Fecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: + fc = *Fecode++ - OP_CRSTAR; + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + Lmin = GET2(Fecode, 1); + Lmax = GET2(Fecode, 1 + IMM2_SIZE); + if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ + reptype = rep_typ[*Fecode - OP_CRSTAR]; + Fecode += 1 + 2 * IMM2_SIZE; + break; + + default: /* No repeat follows */ + Lmin = Lmax = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + if (fc > 255) + { + if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF mode */ + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + fc = *Feptr++; +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (fc > 255) + { + if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else +#endif + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + + /* If Lmax == Lmin we are done. 
Continue with main loop. */ + + if (Lmin == Lmax) continue; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. */ + + if (reptype == REPTYPE_MIN) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + for (;;) + { + RMATCH(Fecode, RM200); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + if (fc > 255) + { + if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + else +#endif + /* Not UTF mode */ + { + for (;;) + { + RMATCH(Fecode, RM23); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + fc = *Feptr++; +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (fc > 255) + { + if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else +#endif + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work backwards. */ + + else + { + Lstart_eptr = Feptr; + +#ifdef SUPPORT_UNICODE + if (utf) + { + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc > 255) + { + if (Fop == OP_CLASS) break; + } + else + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; + Feptr += len; + } + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. 
*/ + + for (;;) + { + RMATCH(Fecode, RM201); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ + BACKCHAR(Feptr); + } + } + else +#endif + /* Not UTF mode */ + { + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + fc = *Feptr; +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (fc > 255) + { + if (Fop == OP_CLASS) break; + } + else +#endif + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; + Feptr++; + } + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + while (Feptr >= Lstart_eptr) + { + RMATCH(Fecode, RM24); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + +#undef Lbyte_map_address +#undef Lbyte_map +#undef Lstart_eptr +#undef Lmin +#undef Lmax + + + /* ===================================================================== */ + /* Match an extended character class. In the 8-bit library, this opcode is + encountered only when UTF-8 mode is supported. In the 16-bit and + 32-bit libraries, codepoints greater than 255 may be encountered even when + UTF is not supported.
*/ + +#define Lstart_eptr F->temp_sptr[0] +#define Lxclass_data F->temp_sptr[1] +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + { + Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ + Fecode += GET(Fecode, 1); /* Advance past the item */ + + switch (*Fecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: + fc = *Fecode++ - OP_CRSTAR; + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + Lmin = GET2(Fecode, 1); + Lmax = GET2(Fecode, 1 + IMM2_SIZE); + if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ + reptype = rep_typ[*Fecode - OP_CRSTAR]; + Fecode += 1 + 2 * IMM2_SIZE; + break; + + default: /* No repeat follows */ + Lmin = Lmax = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); + } + + /* If Lmax == Lmin we can just continue with the main loop. */ + + if (Lmin == Lmax) continue; + + /* If minimizing, keep testing the rest of the expression and advancing + the pointer while it matches the class. */ + + if (reptype == REPTYPE_MIN) + { + for (;;) + { + RMATCH(Fecode, RM100); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work backwards. 
*/ + + else + { + Lstart_eptr = Feptr; + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } +#ifdef SUPPORT_UNICODE + GETCHARLENTEST(fc, Feptr, len); +#else + fc = *Feptr; +#endif + if (!PRIV(xclass)(fc, Lxclass_data, utf)) break; + Feptr += len; + } + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + + for(;;) + { + RMATCH(Fecode, RM101); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ +#ifdef SUPPORT_UNICODE + if (utf) BACKCHAR(Feptr); +#endif + } + RRETURN(MATCH_NOMATCH); + } + + /* Control never gets here */ + } +#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ + +#undef Lstart_eptr +#undef Lxclass_data +#undef Lmin +#undef Lmax + + + /* ===================================================================== */ + /* Match various character types when PCRE2_UCP is not set. These opcodes + are not generated when PCRE2_UCP is set - instead appropriate property + tests are compiled. 
*/ + + case OP_NOT_DIGIT: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_DIGIT: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_NOT_WHITESPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_WHITESPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_NOT_WORDCHAR: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_WORDCHAR: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_ANYNL: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + + case CHAR_CR: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + } + else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; + break; + + case CHAR_LF: + break; + + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) 
RRETURN(MATCH_NOMATCH); + break; + } + Fecode++; + break; + + case OP_NOT_HSPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ + default: break; + } + Fecode++; + break; + + case OP_HSPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + HSPACE_CASES: break; /* Byte and multibyte cases */ + default: RRETURN(MATCH_NOMATCH); + } + Fecode++; + break; + + case OP_NOT_VSPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); + default: break; + } + Fecode++; + break; + + case OP_VSPACE: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + VSPACE_CASES: break; + default: RRETURN(MATCH_NOMATCH); + } + Fecode++; + break; + + +#ifdef SUPPORT_UNICODE + + /* ===================================================================== */ + /* Check the next character by Unicode property. We will get here only + if the support is in the binary; otherwise a compile-time error occurs. 
*/ + + case OP_PROP: + case OP_NOTPROP: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + { + const uint32_t *cp; + const ucd_record *prop = GET_UCD(fc); + BOOL notmatch = Fop == OP_NOTPROP; + + switch(Fecode[1]) + { + case PT_ANY: + if (notmatch) RRETURN(MATCH_NOMATCH); + break; + + case PT_LAMP: + if ((prop->chartype == ucp_Lu || + prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_GC: + if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_PC: + if ((Fecode[2] == prop->chartype) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SC: + if ((Fecode[2] == prop->script) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SCX: + { + BOOL ok = (Fecode[2] == prop->script || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0); + if (ok == notmatch) RRETURN(MATCH_NOMATCH); + } + break; + + /* These are specials */ + + case PT_ALNUM: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(fc) + { + HSPACE_CASES: + VSPACE_CASES: + if (notmatch) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + } + break; + + case PT_WORD: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || + fc == CHAR_UNDERSCORE) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_CLIST: + cp = PRIV(ucd_caseless_sets) + Fecode[2]; + for (;;) + { + if (fc < *cp) + { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } } + if (fc == *cp++) + { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; } + } + break; + + case PT_UCNC: + if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || + fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || + fc >= 0xe000) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_BIDICL: + if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + + case PT_BOOL: + { + BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), Fecode[2]) != 0; + if (ok == notmatch) RRETURN(MATCH_NOMATCH); + } + break; + + /* This should never occur */ + + default: + return PCRE2_ERROR_INTERNAL; + } + + Fecode += 3; + } + break; + + + /* ===================================================================== */ + /* Match an extended Unicode sequence. We will get here only if the support + is in the binary; otherwise a compile-time error occurs. */ + + case OP_EXTUNI: + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + else + { + GETCHARINCTEST(fc, Feptr); + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, + NULL); + } + CHECK_PARTIAL(); + Fecode++; + break; + +#endif /* SUPPORT_UNICODE */ + + + /* ===================================================================== */ + /* Match a single character type repeatedly. 
Note that the property type + does not need to be in a stack frame as it is not used within an RMATCH() + loop. */ + +#define Lstart_eptr F->temp_sptr[0] +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] +#define Lctype F->temp_32[2] +#define Lpropvalue F->temp_32[3] + + case OP_TYPEEXACT: + Lmin = Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATTYPE; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + Lmin = 0; + Lmax = GET2(Fecode, 1); + reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; + Fecode += 1 + IMM2_SIZE; + goto REPEATTYPE; + + case OP_TYPEPOSSTAR: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATTYPE; + + case OP_TYPEPOSPLUS: + reptype = REPTYPE_POS; + Lmin = 1; + Lmax = UINT32_MAX; + Fecode++; + goto REPEATTYPE; + + case OP_TYPEPOSQUERY: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = 1; + Fecode++; + goto REPEATTYPE; + + case OP_TYPEPOSUPTO: + reptype = REPTYPE_POS; + Lmin = 0; + Lmax = GET2(Fecode, 1); + Fecode += 1 + IMM2_SIZE; + goto REPEATTYPE; + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + fc = *Fecode++ - OP_TYPESTAR; + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + + /* Common code for all repeated character type matches. */ + + REPEATTYPE: + Lctype = *Fecode++; /* Code for the character type */ + +#ifdef SUPPORT_UNICODE + if (Lctype == OP_PROP || Lctype == OP_NOTPROP) + { + proptype = *Fecode++; + Lpropvalue = *Fecode++; + } + else proptype = -1; +#endif + + /* First, ensure the minimum number of matches are present. Use inline + code for maximizing the speed, and do the type test once at the start + (i.e. keep it out of the loops). As there are no calls to RMATCH in the + loops, we can use an ordinary variable for "notmatch". The code for UTF + mode is separated out for tidiness, except for Unicode property tests. 
*/ + + if (Lmin > 0) + { +#ifdef SUPPORT_UNICODE + if (proptype >= 0) /* Property tests in all modes */ + { + BOOL notmatch = Lctype == OP_NOTPROP; + switch(proptype) + { + case PT_ANY: + if (notmatch) RRETURN(MATCH_NOMATCH); + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + } + break; + + case PT_LAMP: + for (i = 1; i <= Lmin; i++) + { + int chartype; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + chartype = UCD_CHARTYPE(fc); + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_GC: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_PC: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SC: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SCX: + for (i = 1; i <= Lmin; i++) + { + BOOL ok; + const ucd_record *prop; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + ok = (prop->script == Lpropvalue || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); + if (ok == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_ALNUM: + for (i = 1; i <= Lmin; i++) + { + int category; + if (Feptr >= mb->end_subject) + { 
+ SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || category == ucp_N) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + HSPACE_CASES: + VSPACE_CASES: + if (notmatch) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) + RRETURN(MATCH_NOMATCH); + break; + } + } + break; + + case PT_WORD: + for (i = 1; i <= Lmin; i++) + { + int category; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || category == ucp_N || + fc == CHAR_UNDERSCORE) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_CLIST: + for (i = 1; i <= Lmin; i++) + { + const uint32_t *cp; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + cp = PRIV(ucd_caseless_sets) + Lpropvalue; + for (;;) + { + if (fc < *cp) + { + if (notmatch) break; + RRETURN(MATCH_NOMATCH); + } + if (fc == *cp++) + { + if (notmatch) RRETURN(MATCH_NOMATCH); + break; + } + } + } + break; + + case PT_UCNC: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || + fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || + fc >= 0xe000) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_BIDICL: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= 
mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_BOOL: + for (i = 1; i <= Lmin; i++) + { + BOOL ok; + const ucd_record *prop; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + ok = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), Lpropvalue) != 0; + if (ok == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + + /* This should not occur */ + + default: + return PCRE2_ERROR_INTERNAL; + } + } + + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (Lctype == OP_EXTUNI) + { + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + else + { + GETCHARINCTEST(fc, Feptr); + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, + mb->end_subject, utf, NULL); + } + CHECK_PARTIAL(); + } + } + else +#endif /* SUPPORT_UNICODE */ + +/* Handle all other cases in UTF mode */ + +#ifdef SUPPORT_UNICODE + if (utf) switch(Lctype) + { + case OP_ANY: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); + if (mb->partial != 0 && + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21(Feptr) == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + break; + + case OP_ALLANY: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + break; + + case OP_ANYBYTE: + if (Feptr > 
mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); + Feptr += Lmin; + break; + + case OP_ANYNL: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + + case CHAR_CR: + if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; + break; + + case CHAR_LF: + break; + + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); + break; + } + } + break; + + case OP_NOT_HSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + switch(fc) + { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); + default: break; + } + } + break; + + case OP_HSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + switch(fc) + { + HSPACE_CASES: break; + default: RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_NOT_VSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + switch(fc) + { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); + default: break; + } + } + break; + + case OP_VSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + switch(fc) + { + VSPACE_CASES: break; + default: RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINC(fc, Feptr); + if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_DIGIT: + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; + 
if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21(Feptr); + if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + /* No need to skip more code units - we know it has only one. */ + } + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21(Feptr); + if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + break; + + case OP_WHITESPACE: + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21(Feptr); + if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + /* No need to skip more code units - we know it has only one. */ + } + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21(Feptr); + if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + break; + + case OP_WORDCHAR: + for (i = 1; i <= Lmin; i++) + { + uint32_t cc; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + cc = UCHAR21(Feptr); + if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + /* No need to skip more code units - we know it has only one. */ + } + break; + + default: + return PCRE2_ERROR_INTERNAL; + } /* End switch(Lctype) */ + + else +#endif /* SUPPORT_UNICODE */ + + /* Code for the non-UTF case for minimum matching of operators other + than OP_PROP and OP_NOTPROP. 
*/ + + switch(Lctype) + { + case OP_ANY: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); + if (mb->partial != 0 && + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + *Feptr == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Feptr++; + } + break; + + case OP_ALLANY: + if (Feptr > mb->end_subject - Lmin) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr += Lmin; + break; + + /* This OP_ANYBYTE case will never be reached because \C gets turned + into OP_ALLANY in non-UTF mode. Cut out the code so that coverage + reports don't complain about it's never being used. */ + +/* case OP_ANYBYTE: +* if (Feptr > mb->end_subject - Lmin) +* { +* SCHECK_PARTIAL(); +* RRETURN(MATCH_NOMATCH); +* } +* Feptr += Lmin; +* break; +*/ + case OP_ANYNL: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + switch(*Feptr++) + { + default: RRETURN(MATCH_NOMATCH); + + case CHAR_CR: + if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; + break; + + case CHAR_LF: + break; + + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if PCRE2_CODE_UNIT_WIDTH != 8 + case 0x2028: + case 0x2029: +#endif + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); + break; + } + } + break; + + case OP_NOT_HSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + switch(*Feptr++) + { + default: break; + HSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_HSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + switch(*Feptr++) + { + default: RRETURN(MATCH_NOMATCH); + 
HSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + break; + } + } + break; + + case OP_NOT_VSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + switch(*Feptr++) + { + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + RRETURN(MATCH_NOMATCH); + default: break; + } + } + break; + + case OP_VSPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + switch(*Feptr++) + { + default: RRETURN(MATCH_NOMATCH); + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + break; + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + case OP_DIGIT: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + case OP_WHITESPACE: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) + 
RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + case OP_WORDCHAR: + for (i = 1; i <= Lmin; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + Feptr++; + } + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + } + + /* If Lmin = Lmax we are done. Continue with the main loop. */ + + if (Lmin == Lmax) continue; + + /* If minimizing, we have to test the rest of the pattern before each + subsequent match. This means we cannot use a local "notmatch" variable as + in the other cases. As all 4 temporary 32-bit values in the frame are + already in use, just test the type each time. */ + + if (reptype == REPTYPE_MIN) + { +#ifdef SUPPORT_UNICODE + if (proptype >= 0) + { + switch(proptype) + { + case PT_ANY: + for (;;) + { + RMATCH(Fecode, RM208); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_LAMP: + for (;;) + { + int chartype; + RMATCH(Fecode, RM209); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + chartype = UCD_CHARTYPE(fc); + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_GC: + for (;;) + { + RMATCH(Fecode, RM210); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) + 
RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_PC: + for (;;) + { + RMATCH(Fecode, RM211); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SC: + for (;;) + { + RMATCH(Fecode, RM212); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SCX: + for (;;) + { + BOOL ok; + const ucd_record *prop; + RMATCH(Fecode, RM225); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + ok = (prop->script == Lpropvalue + || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); + if (ok == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_ALNUM: + for (;;) + { + int category; + RMATCH(Fecode, RM213); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + for (;;) + { + RMATCH(Fecode, RM214); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + switch(fc) + { + HSPACE_CASES: + VSPACE_CASES: + if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + } + } + /* Control never gets here */ + + case PT_WORD: + for (;;) + { + int category; + RMATCH(Fecode, RM215); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || + category == ucp_N || + fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_CLIST: + for (;;) + { + const uint32_t *cp; + RMATCH(Fecode, RM216); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + cp = PRIV(ucd_caseless_sets) + Lpropvalue; + for (;;) + { + if (fc < *cp) + { + if (Lctype == OP_NOTPROP) break; + RRETURN(MATCH_NOMATCH); + } + if (fc == *cp++) + { + if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + } + } + } + /* Control never gets here */ + + case PT_UCNC: + for (;;) + { + RMATCH(Fecode, RM217); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || + fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || + fc >= 
0xe000) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_BIDICL: + for (;;) + { + RMATCH(Fecode, RM224); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_BOOL: + for (;;) + { + BOOL ok; + const ucd_record *prop; + RMATCH(Fecode, RM223); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + ok = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), Lpropvalue) != 0; + if (ok == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* This should never occur */ + default: + return PCRE2_ERROR_INTERNAL; + } + } + + /* Match extended Unicode sequences. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (Lctype == OP_EXTUNI) + { + for (;;) + { + RMATCH(Fecode, RM218); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + else + { + GETCHARINCTEST(fc, Feptr); + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, + utf, NULL); + } + CHECK_PARTIAL(); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* UTF mode for non-property testing character types. 
*/ + +#ifdef SUPPORT_UNICODE + if (utf) + { + for (;;) + { + RMATCH(Fecode, RM219); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); + GETCHARINC(fc, Feptr); + switch(Lctype) + { + case OP_ANY: /* This is the non-NL case */ + if (mb->partial != 0 && /* Take care with CRLF partial */ + Feptr >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + fc == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + break; + + case OP_ALLANY: + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + + case CHAR_CR: + if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; + break; + + case CHAR_LF: + break; + + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC + case 0x2028: + case 0x2029: +#endif /* Not EBCDIC */ + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) + RRETURN(MATCH_NOMATCH); + break; + } + break; + + case OP_NOT_HSPACE: + switch(fc) + { + HSPACE_CASES: RRETURN(MATCH_NOMATCH); + default: break; + } + break; + + case OP_HSPACE: + switch(fc) + { + HSPACE_CASES: break; + default: RRETURN(MATCH_NOMATCH); + } + break; + + case OP_NOT_VSPACE: + switch(fc) + { + VSPACE_CASES: RRETURN(MATCH_NOMATCH); + default: break; + } + break; + + case OP_VSPACE: + switch(fc) + { + VSPACE_CASES: break; + default: RRETURN(MATCH_NOMATCH); + } + break; + + case OP_NOT_DIGIT: + if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 
0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF mode */ + { + for (;;) + { + RMATCH(Fecode, RM33); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) + RRETURN(MATCH_NOMATCH); + fc = *Feptr++; + switch(Lctype) + { + case OP_ANY: /* This is the non-NL case */ + if (mb->partial != 0 && /* Take care with CRLF partial */ + Feptr >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + fc == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + break; + + case OP_ALLANY: + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + + case CHAR_CR: + if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; + break; + + case CHAR_LF: + break; + + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#if PCRE2_CODE_UNIT_WIDTH != 8 + case 0x2028: + case 0x2029: +#endif + if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) + RRETURN(MATCH_NOMATCH); + break; + } + break; + + case OP_NOT_HSPACE: + switch(fc) + { + default: break; + HSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_HSPACE: + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + HSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + break; + } + break; + + case OP_NOT_VSPACE: + switch(fc) + { + default: break; + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + 
RRETURN(MATCH_NOMATCH); + } + break; + + case OP_VSPACE: + switch(fc) + { + default: RRETURN(MATCH_NOMATCH); + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + break; + } + break; + + case OP_NOT_DIGIT: + if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + } + } + /* Control never gets here */ + } + + /* If maximizing, it is worth using inline code for speed, doing the type + test once at the start (i.e. keep it out of the loops). Once again, + "notmatch" can be an ordinary local variable because the loops do not call + RMATCH. 
*/ + + else + { + Lstart_eptr = Feptr; /* Remember where we started */ + +#ifdef SUPPORT_UNICODE + if (proptype >= 0) + { + BOOL notmatch = Lctype == OP_NOTPROP; + switch(proptype) + { + case PT_ANY: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if (notmatch) break; + Feptr+= len; + } + break; + + case PT_LAMP: + for (i = Lmin; i < Lmax; i++) + { + int chartype; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + chartype = UCD_CHARTYPE(fc); + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == notmatch) + break; + Feptr+= len; + } + break; + + case PT_GC: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break; + Feptr+= len; + } + break; + + case PT_PC: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break; + Feptr+= len; + } + break; + + case PT_SC: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break; + Feptr+= len; + } + break; + + case PT_SCX: + for (i = Lmin; i < Lmax; i++) + { + BOOL ok; + const ucd_record *prop; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + prop = GET_UCD(fc); + ok = (prop->script == Lpropvalue || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); + if (ok == notmatch) break; + Feptr+= len; + } + break; + + case PT_ALNUM: + for (i = Lmin; i < Lmax; i++) + { + int category; + int len 
= 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || category == ucp_N) == notmatch) + break; + Feptr+= len; + } + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + switch(fc) + { + HSPACE_CASES: + VSPACE_CASES: + if (notmatch) goto ENDLOOP99; /* Break the loop */ + break; + + default: + if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) + goto ENDLOOP99; /* Break the loop */ + break; + } + Feptr+= len; + } + ENDLOOP99: + break; + + case PT_WORD: + for (i = Lmin; i < Lmax; i++) + { + int category; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + category = UCD_CATEGORY(fc); + if ((category == ucp_L || category == ucp_N || + fc == CHAR_UNDERSCORE) == notmatch) + break; + Feptr+= len; + } + break; + + case PT_CLIST: + for (i = Lmin; i < Lmax; i++) + { + const uint32_t *cp; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + cp = PRIV(ucd_caseless_sets) + Lpropvalue; + for (;;) + { + if (fc < *cp) + { if (notmatch) break; else goto GOT_MAX; } + if (fc == *cp++) + { if (notmatch) goto GOT_MAX; else break; } + } + Feptr += len; + } + GOT_MAX: + break; + + case PT_UCNC: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || + fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || + fc >= 0xe000) == notmatch) + 
break; + Feptr += len; + } + break; + + case PT_BIDICL: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break; + Feptr+= len; + } + break; + + case PT_BOOL: + for (i = Lmin; i < Lmax; i++) + { + BOOL ok; + const ucd_record *prop; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + prop = GET_UCD(fc); + ok = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), Lpropvalue) != 0; + if (ok == notmatch) break; + Feptr+= len; + } + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + + /* Feptr is now past the end of the maximum run */ + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + + for(;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM222); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + if (utf) BACKCHAR(Feptr); + } + } + + /* Match extended Unicode grapheme clusters. We will get here only if the + support is in the binary; otherwise a compile-time error occurs. */ + + else if (Lctype == OP_EXTUNI) + { + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + else + { + GETCHARINCTEST(fc, Feptr); + Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, + utf, NULL); + } + CHECK_PARTIAL(); + } + + /* Feptr is now past the end of the maximum run */ + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start + of the run while backtracking because the use of \C in UTF mode can + cause BACKCHAR to move back past Lstart_eptr. This is just palliative; + the use of \C in UTF mode is fraught with danger. 
*/ + + for(;;) + { + int lgb, rgb; + PCRE2_SPTR fptr; + + if (Feptr <= Lstart_eptr) break; /* At start of char run */ + RMATCH(Fecode, RM220); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + + /* Backtracking over an extended grapheme cluster involves inspecting + the previous two characters (if present) to see if a break is + permitted between them. */ + + Feptr--; + if (!utf) fc = *Feptr; else + { + BACKCHAR(Feptr); + GETCHAR(fc, Feptr); + } + rgb = UCD_GRAPHBREAK(fc); + + for (;;) + { + if (Feptr <= Lstart_eptr) break; /* At start of char run */ + fptr = Feptr - 1; + if (!utf) fc = *fptr; else + { + BACKCHAR(fptr); + GETCHAR(fc, fptr); + } + lgb = UCD_GRAPHBREAK(fc); + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; + Feptr = fptr; + rgb = lgb; + } + } + } + + else +#endif /* SUPPORT_UNICODE */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + switch(Lctype) + { + case OP_ANY: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (IS_NEWLINE(Feptr)) break; + if (mb->partial != 0 && /* Take care with CRLF partial */ + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21(Feptr) == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + break; + + case OP_ALLANY: + if (Lmax < UINT32_MAX) + { + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + Feptr++; + ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); + } + } + else + { + Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ + SCHECK_PARTIAL(); + } + break; + + /* The "byte" (i.e. 
"code unit") case is the same as non-UTF */ + + case OP_ANYBYTE: + fc = Lmax - Lmin; + if (fc > (uint32_t)(mb->end_subject - Feptr)) + { + Feptr = mb->end_subject; + SCHECK_PARTIAL(); + } + else Feptr += fc; + break; + + case OP_ANYNL: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc == CHAR_CR) + { + if (++Feptr >= mb->end_subject) break; + if (UCHAR21(Feptr) == CHAR_LF) Feptr++; + } + else + { + if (fc != CHAR_LF && + (mb->bsr_convention == PCRE2_BSR_ANYCRLF || + (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL +#ifndef EBCDIC + && fc != 0x2028 && fc != 0x2029 +#endif /* Not EBCDIC */ + ))) + break; + Feptr += len; + } + } + break; + + case OP_NOT_HSPACE: + case OP_HSPACE: + for (i = Lmin; i < Lmax; i++) + { + BOOL gotspace; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + switch(fc) + { + HSPACE_CASES: gotspace = TRUE; break; + default: gotspace = FALSE; break; + } + if (gotspace == (Lctype == OP_NOT_HSPACE)) break; + Feptr += len; + } + break; + + case OP_NOT_VSPACE: + case OP_VSPACE: + for (i = Lmin; i < Lmax; i++) + { + BOOL gotspace; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + switch(fc) + { + VSPACE_CASES: gotspace = TRUE; break; + default: gotspace = FALSE; break; + } + if (gotspace == (Lctype == OP_NOT_VSPACE)) break; + Feptr += len; + } + break; + + case OP_NOT_DIGIT: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; + Feptr+= len; + } + break; + + case OP_DIGIT: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc >= 256 ||(mb->ctypes[fc] & 
ctype_digit) == 0) break; + Feptr+= len; + } + break; + + case OP_NOT_WHITESPACE: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; + Feptr+= len; + } + break; + + case OP_WHITESPACE: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; + Feptr+= len; + } + break; + + case OP_NOT_WORDCHAR: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; + Feptr+= len; + } + break; + + case OP_WORDCHAR: + for (i = Lmin; i < Lmax; i++) + { + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLEN(fc, Feptr, len); + if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; + Feptr+= len; + } + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go + too far. 
*/ + + for(;;) + { + if (Feptr <= Lstart_eptr) break; + RMATCH(Fecode, RM221); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + BACKCHAR(Feptr); + if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && + UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) + Feptr--; + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF mode */ + { + switch(Lctype) + { + case OP_ANY: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (IS_NEWLINE(Feptr)) break; + if (mb->partial != 0 && /* Take care with CRLF partial */ + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + *Feptr == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Feptr++; + } + break; + + case OP_ALLANY: + case OP_ANYBYTE: + fc = Lmax - Lmin; + if (fc > (uint32_t)(mb->end_subject - Feptr)) + { + Feptr = mb->end_subject; + SCHECK_PARTIAL(); + } + else Feptr += fc; + break; + + case OP_ANYNL: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + fc = *Feptr; + if (fc == CHAR_CR) + { + if (++Feptr >= mb->end_subject) break; + if (*Feptr == CHAR_LF) Feptr++; + } + else + { + if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || + (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL +#if PCRE2_CODE_UNIT_WIDTH != 8 + && fc != 0x2028 && fc != 0x2029 +#endif + ))) break; + Feptr++; + } + } + break; + + case OP_NOT_HSPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + switch(*Feptr) + { + default: Feptr++; break; + HSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + goto ENDLOOP00; + } + } + ENDLOOP00: + break; + + case OP_HSPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + switch(*Feptr) + { + default: goto ENDLOOP01; + HSPACE_BYTE_CASES: +#if 
PCRE2_CODE_UNIT_WIDTH != 8 + HSPACE_MULTIBYTE_CASES: +#endif + Feptr++; break; + } + } + ENDLOOP01: + break; + + case OP_NOT_VSPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + switch(*Feptr) + { + default: Feptr++; break; + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + goto ENDLOOP02; + } + } + ENDLOOP02: + break; + + case OP_VSPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + switch(*Feptr) + { + default: goto ENDLOOP03; + VSPACE_BYTE_CASES: +#if PCRE2_CODE_UNIT_WIDTH != 8 + VSPACE_MULTIBYTE_CASES: +#endif + Feptr++; break; + } + } + ENDLOOP03: + break; + + case OP_NOT_DIGIT: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) + break; + Feptr++; + } + break; + + case OP_DIGIT: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) + break; + Feptr++; + } + break; + + case OP_NOT_WHITESPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) + break; + Feptr++; + } + break; + + case OP_WHITESPACE: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) + break; + Feptr++; + } + break; + + case OP_NOT_WORDCHAR: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) + break; + Feptr++; + } + break; + + case OP_WORDCHAR: + for (i = Lmin; i < Lmax; i++) + { + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + 
if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) + break; + Feptr++; + } + break; + + default: + return PCRE2_ERROR_INTERNAL; + } + + if (reptype == REPTYPE_POS) continue; /* No backtracking */ + + for (;;) + { + if (Feptr == Lstart_eptr) break; + RMATCH(Fecode, RM34); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr--; + if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && + Feptr[-1] == CHAR_CR) Feptr--; + } + } + } + break; /* End of repeat character type processing */ + +#undef Lstart_eptr +#undef Lmin +#undef Lmax +#undef Lctype +#undef Lpropvalue + + + /* ===================================================================== */ + /* Match a back reference, possibly repeatedly. Look past the end of the + item to see if there is repeat information following. The OP_REF and + OP_REFI opcodes are used for a reference to a numbered group or to a + non-duplicated named group. For a duplicated named group, OP_DNREF and + OP_DNREFI are used. In this case we must scan the list of groups to which + the name refers, and use the first one that is set. */ + +#define Lmin F->temp_32[0] +#define Lmax F->temp_32[1] +#define Lcaseless F->temp_32[2] +#define Lstart F->temp_sptr[0] +#define Loffset F->temp_size + + case OP_DNREF: + case OP_DNREFI: + Lcaseless = (Fop == OP_DNREFI); + { + int count = GET2(Fecode, 1+IMM2_SIZE); + PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; + Fecode += 1 + 2*IMM2_SIZE; + + while (count-- > 0) + { + Loffset = (GET2(slot, 0) << 1) - 2; + if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; + slot += mb->name_entry_size; + } + } + goto REF_REPEAT; + + case OP_REF: + case OP_REFI: + Lcaseless = (Fop == OP_REFI); + Loffset = (GET2(Fecode, 1) << 1) - 2; + Fecode += 1 + IMM2_SIZE; + + /* Set up for repetition, or handle the non-repeated case. The maximum and + minimum must be in the heap frame, but as they are short-term values, we + use temporary fields. 
*/ + + REF_REPEAT: + switch (*Fecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + fc = *Fecode++ - OP_CRSTAR; + Lmin = rep_min[fc]; + Lmax = rep_max[fc]; + reptype = rep_typ[fc]; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + Lmin = GET2(Fecode, 1); + Lmax = GET2(Fecode, 1 + IMM2_SIZE); + reptype = rep_typ[*Fecode - OP_CRSTAR]; + if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ + Fecode += 1 + 2 * IMM2_SIZE; + break; + + default: /* No repeat follows */ + { + rrc = match_ref(Loffset, Lcaseless, F, mb, &length); + if (rrc != 0) + { + if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + } + Feptr += length; + continue; /* With the main loop */ + } + + /* Handle repeated back references. If a set group has length zero, just + continue with the main loop, because it matches however many times. For an + unset reference, if the minimum is zero, we can also just continue. We can + also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset + group behave as a zero-length group. For any other unset cases, carrying + on will result in NOMATCH. */ + + if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) + { + if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; + } + else /* Group is not set */ + { + if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) + continue; + } + + /* First, ensure the minimum number of matches are present. */ + + for (i = 1; i <= Lmin; i++) + { + PCRE2_SIZE slength; + rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + if (rrc != 0) + { + if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr += slength; + } + + /* If min = max, we are done. They are not both allowed to be zero. */ + + if (Lmin == Lmax) continue; + + /* If minimizing, keep trying and advancing the pointer. 
*/ + + if (reptype == REPTYPE_MIN) + { + for (;;) + { + PCRE2_SIZE slength; + RMATCH(Fecode, RM20); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + if (rrc != 0) + { + if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ + CHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + Feptr += slength; + } + /* Control never gets here */ + } + + /* If maximizing, find the longest string and work backwards, as long as + the matched lengths for each iteration are the same. */ + + else + { + BOOL samelengths = TRUE; + Lstart = Feptr; /* Starting position */ + Flength = Fovector[Loffset+1] - Fovector[Loffset]; + + for (i = Lmin; i < Lmax; i++) + { + PCRE2_SIZE slength; + rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + if (rrc != 0) + { + /* Can't use CHECK_PARTIAL because we don't want to update Feptr in + the soft partial matching case. */ + + if (rrc > 0 && mb->partial != 0 && + mb->end_subject > mb->start_used_ptr) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + break; + } + + if (slength != Flength) samelengths = FALSE; + Feptr += slength; + } + + /* If the length matched for each repetition is the same as the length of + the captured group, we can easily work backwards. This is the normal + case. However, in caseless UTF-8 mode there are pairs of case-equivalent + characters whose lengths (in terms of code units) differ. However, this + is very rare, so we handle it by re-matching fewer and fewer times. */ + + if (samelengths) + { + while (Feptr >= Lstart) + { + RMATCH(Fecode, RM21); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Feptr -= Flength; + } + } + + /* The rare case of non-matching lengths. Re-scan the repetition for each + iteration. We know that match_ref() will succeed every time. 
*/ + + else + { + Lmax = i; + for (;;) + { + RMATCH(Fecode, RM22); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Feptr == Lstart) break; /* Failed after minimal repetition */ + Feptr = Lstart; + Lmax--; + for (i = Lmin; i < Lmax; i++) + { + PCRE2_SIZE slength; + (void)match_ref(Loffset, Lcaseless, F, mb, &slength); + Feptr += slength; + } + } + } + + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + +#undef Lcaseless +#undef Lmin +#undef Lmax +#undef Lstart +#undef Loffset + + + +/* ========================================================================= */ +/* Opcodes for the start of various parenthesized items */ +/* ========================================================================= */ + + /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the + (*THEN) is within the current branch by comparing the address of OP_THEN + that is passed back with the end of the branch. If (*THEN) is within the + current branch, and the branch is one of two or more alternatives (it + either starts or ends with OP_ALT), we have reached the limit of THEN's + action, so convert the return code to NOMATCH, which will cause normal + backtracking to happen from now on. Otherwise, THEN is passed back to an + outer alternative. This implements Perl's treatment of parenthesized + groups, where a group not containing | does not affect the current + alternative, that is, (X) is NOT the same as (X|(*F)). */ + + + /* ===================================================================== */ + /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive + bracket group, indicating that it may occur zero times. It may repeat + infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in + the pattern. Brackets with fixed upper repeat limits are compiled as a + number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. + Possessive groups with possible zero repeats are preceded by BRAPOSZERO. 
*/ + +#define Lnext_ecode F->temp_sptr[0] + + case OP_BRAZERO: + Lnext_ecode = Fecode + 1; + RMATCH(Lnext_ecode, RM9); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); + Fecode = Lnext_ecode + 1 + LINK_SIZE; + break; + + case OP_BRAMINZERO: + Lnext_ecode = Fecode + 1; + do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); + RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Fecode++; + break; + +#undef Lnext_ecode + + case OP_SKIPZERO: + Fecode++; + do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); + Fecode += 1 + LINK_SIZE; + break; + + + /* ===================================================================== */ + /* Handle possessive brackets with an unlimited repeat. The end of these + brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without + going further in the pattern. */ + +#define Lframe_type F->temp_32[0] +#define Lmatched_once F->temp_32[1] +#define Lzero_allowed F->temp_32[2] +#define Lstart_eptr F->temp_sptr[0] +#define Lstart_group F->temp_sptr[1] + + case OP_BRAPOSZERO: + Lzero_allowed = TRUE; /* Zero repeat is allowed */ + Fecode += 1; + if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) + goto POSSESSIVE_CAPTURE; + goto POSSESSIVE_NON_CAPTURE; + + case OP_BRAPOS: + case OP_SBRAPOS: + Lzero_allowed = FALSE; /* Zero repeat not allowed */ + + POSSESSIVE_NON_CAPTURE: + Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ + goto POSSESSIVE_GROUP; + + case OP_CBRAPOS: + case OP_SCBRAPOS: + Lzero_allowed = FALSE; /* Zero repeat not allowed */ + + POSSESSIVE_CAPTURE: + number = GET2(Fecode, 1+LINK_SIZE); + Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ + + POSSESSIVE_GROUP: + Lmatched_once = FALSE; /* Never matched */ + Lstart_group = Fecode; /* Start of this group */ + + for (;;) + { + Lstart_eptr = Feptr; /* Position at group start */ + group_frame_type = Lframe_type; + RMATCH(Fecode + 
PRIV(OP_lengths)[*Fecode], RM8); + if (rrc == MATCH_KETRPOS) + { + Lmatched_once = TRUE; /* Matched at least once */ + if (Feptr == Lstart_eptr) /* Empty match; skip to end */ + { + do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); + break; + } + + Fecode = Lstart_group; + continue; + } + + /* See comment above about handling THEN. */ + + if (rrc == MATCH_THEN) + { + PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); + if (mb->verb_ecode_ptr < next_ecode && + (*Fecode == OP_ALT || *next_ecode == OP_ALT)) + rrc = MATCH_NOMATCH; + } + + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Fecode += GET(Fecode, 1); + if (*Fecode != OP_ALT) break; + } + + /* Success if matched something or zero repeat allowed */ + + if (Lmatched_once || Lzero_allowed) + { + Fecode += 1 + LINK_SIZE; + break; + } + + RRETURN(MATCH_NOMATCH); + +#undef Lmatched_once +#undef Lzero_allowed +#undef Lframe_type +#undef Lstart_eptr +#undef Lstart_group + + + /* ===================================================================== */ + /* Handle non-capturing brackets that cannot match an empty string. When we + get to the final alternative within the brackets, as long as there are no + THEN's in the pattern, we can optimize by not recording a new backtracking + point. (Ideally we should test for a THEN within this group, but we don't + have that information.) Don't do this if we are at the very top level, + however, because that would make handling assertions and once-only brackets + messier when there is nothing to go back to. */ + +#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ +#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ + + case OP_BRA: + if (mb->hasthen || Frdepth == 0) + { + Lframe_type = 0; + goto GROUPLOOP; + } + + for (;;) + { + Lnext_branch = Fecode + GET(Fecode, 1); + if (*Lnext_branch != OP_ALT) break; + + /* This is never the final branch. 
We do not need to test for MATCH_THEN + here because this code is not used when there is a THEN in the pattern. */ + + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Fecode = Lnext_branch; + } + + /* Hit the start of the final branch. Continue at this level. */ + + Fecode += PRIV(OP_lengths)[*Fecode]; + break; + +#undef Lnext_branch + + + /* ===================================================================== */ + /* Handle a capturing bracket, other than those that are possessive with an + unlimited repeat. */ + + case OP_CBRA: + case OP_SCBRA: + Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); + goto GROUPLOOP; + + + /* ===================================================================== */ + /* Atomic groups and non-capturing brackets that can match an empty string + must record a backtracking point and also set up a chained frame. */ + + case OP_ONCE: + case OP_SCRIPT_RUN: + case OP_SBRA: + Lframe_type = GF_NOCAPTURE | Fop; + + GROUPLOOP: + for (;;) + { + group_frame_type = Lframe_type; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); + if (rrc == MATCH_THEN) + { + PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); + if (mb->verb_ecode_ptr < next_ecode && + (*Fecode == OP_ALT || *next_ecode == OP_ALT)) + rrc = MATCH_NOMATCH; + } + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Fecode += GET(Fecode, 1); + if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); + } + /* Control never reaches here. */ + +#undef Lframe_type + + + /* ===================================================================== */ + /* Recursion either matches the current regex, or some subexpression. The + offset data is the offset to the starting bracket from the start of the + whole pattern. (This is so that it works from duplicated subpatterns.) */ + +#define Lframe_type F->temp_32[0] +#define Lstart_branch F->temp_sptr[0] + + case OP_RECURSE: + bracode = mb->start_code + GET(Fecode, 1); + number = (bracode == mb->start_code)? 
0 : GET2(bracode, 1 + LINK_SIZE); + + /* If we are already in a recursion, check for repeating the same one + without advancing the subject pointer. This should catch convoluted mutual + recursions. (Some simple cases are caught at compile time.) */ + + if (Fcurrent_recurse != RECURSE_UNSET) + { + offset = Flast_group_offset; + while (offset != PCRE2_UNSET) + { + N = (heapframe *)((char *)match_data->heapframes + offset); + P = (heapframe *)((char *)N - frame_size); + if (N->group_frame_type == (GF_RECURSE | number)) + { + if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP; + break; + } + offset = P->last_group_offset; + } + } + + /* Now run the recursion, branch by branch. */ + + Lstart_branch = bracode; + Lframe_type = GF_RECURSE | number; + + for (;;) + { + PCRE2_SPTR next_ecode; + + group_frame_type = Lframe_type; + RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); + next_ecode = Lstart_branch + GET(Lstart_branch,1); + + /* Handle backtracking verbs, which are defined in a range that can + easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to + escape beyond a recursion; they cause a NOMATCH for the entire recursion. + + When one of these verbs triggers, the current recursion group number is + recorded. If it matches the recursion we are processing, the verb + happened within the recursion and we must deal with it. Otherwise it must + have happened after the recursion completed, and so has to be passed + back. See comment above about handling THEN. */ + + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && + mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) + { + if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && + (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) + rrc = MATCH_NOMATCH; + else RRETURN(MATCH_NOMATCH); + } + + /* Note that carrying on after (*ACCEPT) in a recursion is handled in the + OP_ACCEPT code. Nothing needs to be done here. 
*/ + + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Lstart_branch = next_ecode; + if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); + } + /* Control never reaches here. */ + +#undef Lframe_type +#undef Lstart_branch + + + /* ===================================================================== */ + /* Positive assertions are like other groups except that PCRE doesn't allow + the effect of (*THEN) to escape beyond an assertion; it is therefore + treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its + captures and mark retained. Any other return is an error. */ + +#define Lframe_type F->temp_32[0] + + case OP_ASSERT: + case OP_ASSERTBACK: + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: + Lframe_type = GF_NOCAPTURE | Fop; + for (;;) + { + group_frame_type = Lframe_type; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); + if (rrc == MATCH_ACCEPT) + { + memcpy(Fovector, + (char *)assert_accept_frame + offsetof(heapframe, ovector), + assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); + Foffset_top = assert_accept_frame->offset_top; + Fmark = assert_accept_frame->mark; + break; + } + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + Fecode += GET(Fecode, 1); + if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); + } + + do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); + Fecode += 1 + LINK_SIZE; + break; + +#undef Lframe_type + + + /* ===================================================================== */ + /* Handle negative assertions. Loop for each non-matching branch as for + positive assertions. */ + +#define Lframe_type F->temp_32[0] + + case OP_ASSERT_NOT: + case OP_ASSERTBACK_NOT: + Lframe_type = GF_NOCAPTURE | Fop; + + for (;;) + { + group_frame_type = Lframe_type; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); + switch(rrc) + { + case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ + case MATCH_MATCH: + RRETURN (MATCH_NOMATCH); + + case MATCH_NOMATCH: /* Branch failed, try next if present. 
*/ + case MATCH_THEN: + Fecode += GET(Fecode, 1); + if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; + break; + + case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ + case MATCH_SKIP: + case MATCH_PRUNE: + do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); + goto ASSERT_NOT_FAILED; + + default: /* Pass back any other return */ + RRETURN(rrc); + } + } + + /* None of the branches have matched or there was a backtrack to (*COMMIT), + (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a + negative assertion, so carry on. */ + + ASSERT_NOT_FAILED: + Fecode += 1 + LINK_SIZE; + break; + +#undef Lframe_type + + + /* ===================================================================== */ + /* The callout item calls an external function, if one is provided, passing + details of the match so far. This is mainly for debugging, though the + function is able to force a failure. */ + + case OP_CALLOUT: + case OP_CALLOUT_STR: + rrc = do_callout(F, mb, &length); + if (rrc > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + Fecode += length; + break; + + + /* ===================================================================== */ + /* Conditional group: compilation checked that there are no more than two + branches. If the condition is false, skipping the first branch takes us + past the end of the item if there is only one branch, but that's exactly + what we want. */ + + case OP_COND: + case OP_SCOND: + + /* The variable Flength will be added to Fecode when the condition is + false, to get to the second branch. Setting it to the offset to the ALT or + KET, then incrementing Fecode achieves this effect. However, if the second + branch is non-existent, we must point to the KET so that the end of the + group is correctly processed. We now have Fecode pointing to the condition + or callout. 
*/ + + Flength = GET(Fecode, 1); /* Offset to the second branch */ + if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; + Fecode += 1 + LINK_SIZE; /* From this opcode */ + + /* Because of the way auto-callout works during compile, a callout item is + inserted between OP_COND and an assertion condition. Such a callout can + also be inserted manually. */ + + if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) + { + rrc = do_callout(F, mb, &length); + if (rrc > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + + /* Advance Fecode past the callout, so it now points to the condition. We + must adjust Flength so that the value of Fecode+Flength is unchanged. */ + + Fecode += length; + Flength -= length; + } + + /* Test the various possible conditions */ + + condition = FALSE; + switch(*Fecode) + { + case OP_RREF: /* Group recursion test */ + if (Fcurrent_recurse != RECURSE_UNSET) + { + number = GET2(Fecode, 1); + condition = (number == RREF_ANY || number == Fcurrent_recurse); + } + break; + + case OP_DNRREF: /* Duplicate named group recursion test */ + if (Fcurrent_recurse != RECURSE_UNSET) + { + int count = GET2(Fecode, 1 + IMM2_SIZE); + PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; + while (count-- > 0) + { + number = GET2(slot, 0); + condition = number == Fcurrent_recurse; + if (condition) break; + slot += mb->name_entry_size; + } + } + break; + + case OP_CREF: /* Numbered group used test */ + offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ + condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; + break; + + case OP_DNCREF: /* Duplicate named group used test */ + { + int count = GET2(Fecode, 1 + IMM2_SIZE); + PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; + while (count-- > 0) + { + offset = (GET2(slot, 0) << 1) - 2; + condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; + if (condition) break; + slot += mb->name_entry_size; + } + } + break; + + case 
OP_FALSE: + case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ + break; + + case OP_TRUE: + condition = TRUE; + break; + + /* The condition is an assertion. Run code similar to the assertion code + above. */ + +#define Lpositive F->temp_32[0] +#define Lstart_branch F->temp_sptr[0] + + default: + Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); + Lstart_branch = Fecode; + + for (;;) + { + group_frame_type = GF_CONDASSERT | *Fecode; + RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); + + switch(rrc) + { + case MATCH_ACCEPT: /* Save captures */ + memcpy(Fovector, + (char *)assert_accept_frame + offsetof(heapframe, ovector), + assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); + Foffset_top = assert_accept_frame->offset_top; + + /* Fall through */ + /* In the case of a match, the captures have already been put into + the current frame. */ + + case MATCH_MATCH: + condition = Lpositive; /* TRUE for positive assertion */ + break; + + /* PCRE doesn't allow the effect of (*THEN) to escape beyond an + assertion; it is therefore always treated as NOMATCH. */ + + case MATCH_NOMATCH: + case MATCH_THEN: + Lstart_branch += GET(Lstart_branch, 1); + if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ + condition = !Lpositive; /* TRUE for negative assertion */ + break; + + /* These force no match without checking other branches. */ + + case MATCH_COMMIT: + case MATCH_SKIP: + case MATCH_PRUNE: + condition = !Lpositive; + break; + + default: + RRETURN(rrc); + } + break; /* Out of the branch loop */ + } + + /* If the condition is true, find the end of the assertion so that + advancing past it gets us to the start of the first branch. */ + + if (condition) + { + do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); + } + break; /* End of assertion condition */ + } + +#undef Lpositive +#undef Lstart_branch + + /* Choose branch according to the condition. */ + + Fecode += condition? 
PRIV(OP_lengths)[*Fecode] : Flength; + + /* If the opcode is OP_SCOND it means we are at a repeated conditional + group that might match an empty string. We must therefore descend a level + so that the start is remembered for checking. For OP_COND we can just + continue at this level. */ + + if (Fop == OP_SCOND) + { + group_frame_type = GF_NOCAPTURE | Fop; + RMATCH(Fecode, RM35); + RRETURN(rrc); + } + break; + + + +/* ========================================================================= */ +/* End of start of parenthesis opcodes */ +/* ========================================================================= */ + + + /* ===================================================================== */ + /* Move the subject pointer back. This occurs only at the start of each + branch of a lookbehind assertion. If we are too close to the start to move + back, fail. When working with UTF-8 we move back a number of characters, + not bytes. */ + + case OP_REVERSE: + number = GET(Fecode, 1); +#ifdef SUPPORT_UNICODE + if (utf) + { + while (number-- > 0) + { + if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); + Feptr--; + BACKCHAR(Feptr); + } + } + else +#endif + + /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */ + + { + if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); + Feptr -= number; + } + + /* Save the earliest consulted character, then skip to next opcode */ + + if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; + Fecode += 1 + LINK_SIZE; + break; + + + /* ===================================================================== */ + /* An alternation is the end of a branch; scan along to find the end of the + bracketed group. */ + + case OP_ALT: + do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); + break; + + + /* ===================================================================== */ + /* The end of a parenthesized group. 
For all but OP_BRA and OP_COND, the + starting frame was added to the chained frames in order to remember the + starting subject position for the group. */ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + case OP_KETRPOS: + + bracode = Fecode - GET(Fecode, 1); + + /* Point N to the frame at the start of the most recent group. + Remember the subject pointer at the start of the group. */ + + if (*bracode != OP_BRA && *bracode != OP_COND) + { + N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset); + P = (heapframe *)((char *)N - frame_size); + Flast_group_offset = P->last_group_offset; + +#ifdef DEBUG_SHOW_RMATCH + fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", + N->rdepth, N->group_frame_type, + (char *)P->eptr - (char *)mb->start_subject); +#endif + + /* If we are at the end of an assertion that is a condition, return a + match, discarding any intermediate backtracking points. Copy back the + mark setting and the captures into the frame before N so that they are + set on return. Doing this for all assertions, both positive and negative, + seems to match what Perl does. */ + + if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) + { + memcpy((char *)P + offsetof(heapframe, ovector), Fovector, + Foffset_top * sizeof(PCRE2_SIZE)); + P->offset_top = Foffset_top; + P->mark = Fmark; + Fback_frame = (char *)F - (char *)P; + RRETURN(MATCH_MATCH); + } + } + else P = NULL; /* Indicates starting frame not recorded */ + + /* The group was not a conditional assertion. */ + + switch (*bracode) + { + case OP_BRA: /* No need to do anything for these */ + case OP_COND: + case OP_SCOND: + break; + + /* Non-atomic positive assertions are like OP_BRA, except that the + subject pointer must be put back to where it was at the start of the + assertion. 
*/ + + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: + if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; + Feptr = P->eptr; + break; + + /* Atomic positive assertions are like OP_ONCE, except that in addition + the subject pointer must be put back to where it was at the start of the + assertion. */ + + case OP_ASSERT: + case OP_ASSERTBACK: + if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; + Feptr = P->eptr; + /* Fall through */ + + /* For an atomic group, discard internal backtracking points. We must + also ensure that any remaining branches within the top-level of the group + are not tried. Do this by adjusting the code pointer within the backtrack + frame so that it points to the final branch. */ + + case OP_ONCE: + Fback_frame = ((char *)F - (char *)P); + for (;;) + { + uint32_t y = GET(P->ecode,1); + if ((P->ecode)[y] != OP_ALT) break; + P->ecode += y; + } + break; + + /* A matching negative assertion returns MATCH, which is turned into + NOMATCH at the assertion level. */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK_NOT: + RRETURN(MATCH_MATCH); + + /* At the end of a script run, apply the script-checking rules. This code + will never be exercised if Unicode support is not compiled, because in + that environment script runs cause an error at compile time. */ + + case OP_SCRIPT_RUN: + if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); + break; + + /* Whole-pattern recursion is coded as a recurse into group 0, so it + won't be picked up here. Instead, we catch it when the OP_END is reached. + Other recursion is handled here. */ + + case OP_CBRA: + case OP_CBRAPOS: + case OP_SCBRA: + case OP_SCBRAPOS: + number = GET2(bracode, 1+LINK_SIZE); + + /* Handle a recursively called group. We reinstate the previous set of + captures and then carry on after the recursion call. 
*/ + + if (Fcurrent_recurse == number) + { + P = (heapframe *)((char *)N - frame_size); + memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, + P->offset_top * sizeof(PCRE2_SIZE)); + Foffset_top = P->offset_top; + Fcapture_last = P->capture_last; + Fcurrent_recurse = P->current_recurse; + Fecode = P->ecode + 1 + LINK_SIZE; + continue; /* With next opcode */ + } + + /* Deal with actual capturing. */ + + offset = (number << 1) - 2; + Fcapture_last = number; + Fovector[offset] = P->eptr - mb->start_subject; + Fovector[offset+1] = Feptr - mb->start_subject; + if (offset >= Foffset_top) Foffset_top = offset + 2; + break; + } /* End actions relating to the starting opcode */ + + /* OP_KETRPOS is a possessive repeating ket. Remember the current position, + and return the MATCH_KETRPOS. This makes it possible to do the repeats one + at a time from the outer level. This must precede the empty string test - + in this case that test is done at the outer level. */ + + if (*Fecode == OP_KETRPOS) + { + memcpy((char *)P + offsetof(heapframe, eptr), + (char *)F + offsetof(heapframe, eptr), + frame_copy_size); + RRETURN(MATCH_KETRPOS); + } + + /* Handle the different kinds of closing brackets. A non-repeating ket + needs no special action, just continuing at this level. This also happens + for the repeating kets if the group matched no characters, in order to + forcibly break infinite loops. Otherwise, the repeating kets try the rest + of the pattern or restart from the preceding bracket, in the appropriate + order. 
*/ + + if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) + { + if (Fop == OP_KETRMIN) + { + RMATCH(Fecode + 1 + LINK_SIZE, RM6); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + Fecode -= GET(Fecode, 1); + break; /* End of ket processing */ + } + + /* Repeat the maximum number of times (KETRMAX) */ + + RMATCH(bracode, RM7); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + } + + /* Carry on at this level for a non-repeating ket, or after matching an + empty string, or after repeating for a maximum number of times. */ + + Fecode += 1 + LINK_SIZE; + break; + + + /* ===================================================================== */ + /* Start and end of line assertions, not multiline mode. */ + + case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ + if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + case OP_SOD: /* Unconditional start of subject */ + if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + /* When PCRE2_NOTEOL is unset, assert before the subject end, or a + terminating newline unless PCRE2_DOLLAR_ENDONLY is set. 
*/ + + case OP_DOLL: + if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); + if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; + + /* Fall through */ + /* Unconditional end of subject assertion (\z) */ + + case OP_EOD: + if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); + if (mb->partial != 0) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Fecode++; + break; + + /* End of subject or ending \n assertion (\Z) */ + + case OP_EODN: + ASSERT_NL_OR_EOS: + if (Feptr < mb->end_subject && + (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen)) + { + if (mb->partial != 0 && + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + RRETURN(MATCH_NOMATCH); + } + + /* Either at end of string or \n before end. */ + + if (mb->partial != 0) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + Fecode++; + break; + + + /* ===================================================================== */ + /* Start and end of line assertions, multiline mode. */ + + /* Start of subject unless notbol, or after any newline except for one at + the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ + + case OP_CIRCM: + if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) + RRETURN(MATCH_NOMATCH); + if (Feptr != mb->start_subject && + ((Feptr == mb->end_subject && + (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || + !WAS_NEWLINE(Feptr))) + RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + /* Assert before any newline, or before end of subject unless noteol is + set. 
*/ + + case OP_DOLLM: + if (Feptr < mb->end_subject) + { + if (!IS_NEWLINE(Feptr)) + { + if (mb->partial != 0 && + Feptr + 1 >= mb->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } + RRETURN(MATCH_NOMATCH); + } + } + else + { + if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); + SCHECK_PARTIAL(); + } + Fecode++; + break; + + + /* ===================================================================== */ + /* Start of match assertion */ + + case OP_SOM: + if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); + Fecode++; + break; + + + /* ===================================================================== */ + /* Reset the start of match point */ + + case OP_SET_SOM: + Fstart_match = Feptr; + Fecode++; + break; + + + /* ===================================================================== */ + /* Word boundary assertions. Find out if the previous and current + characters are "word" characters. It takes a bit more work in UTF mode. + Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is + not set. When it is set, use Unicode properties if available, even when not + in UTF mode. Remember the earliest and latest consulted characters. 
*/ + + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + if (Feptr == mb->check_subject) prev_is_word = FALSE; else + { + PCRE2_SPTR lastptr = Feptr - 1; +#ifdef SUPPORT_UNICODE + if (utf) + { + BACKCHAR(lastptr); + GETCHAR(fc, lastptr); + } + else +#endif /* SUPPORT_UNICODE */ + fc = *lastptr; + if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; +#ifdef SUPPORT_UNICODE + if ((mb->poptions & PCRE2_UCP) != 0) + { + if (fc == '_') prev_is_word = TRUE; else + { + int cat = UCD_CATEGORY(fc); + prev_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif /* SUPPORT_UNICODE */ + prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; + } + + /* Get status of next character */ + + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + cur_is_word = FALSE; + } + else + { + PCRE2_SPTR nextptr = Feptr + 1; +#ifdef SUPPORT_UNICODE + if (utf) + { + FORWARDCHARTEST(nextptr, mb->end_subject); + GETCHAR(fc, Feptr); + } + else +#endif /* SUPPORT_UNICODE */ + fc = *Feptr; + if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; +#ifdef SUPPORT_UNICODE + if ((mb->poptions & PCRE2_UCP) != 0) + { + if (fc == '_') cur_is_word = TRUE; else + { + int cat = UCD_CATEGORY(fc); + cur_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif /* SUPPORT_UNICODE */ + cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; + } + + /* Now see if the situation is what we want */ + + if ((*Fecode++ == OP_WORD_BOUNDARY)? + cur_is_word == prev_is_word : cur_is_word != prev_is_word) + RRETURN(MATCH_NOMATCH); + break; + + + /* ===================================================================== */ + /* Backtracking (*VERB)s, with and without arguments. Note that if the + pattern is successfully matched, we do not come back from RMATCH. 
*/ + + case OP_MARK: + Fmark = mb->nomatch_mark = Fecode + 2; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); + + /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an + argument, and we must check whether that argument matches this MARK's + argument. It is passed back in mb->verb_skip_ptr. If it does match, we + return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject + position that corresponds to this mark. Otherwise, pass back the return + code unaltered. */ + + if (rrc == MATCH_SKIP_ARG && + PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) + { + mb->verb_skip_ptr = Feptr; /* Pass back current position */ + RRETURN(MATCH_SKIP); + } + RRETURN(rrc); + + case OP_FAIL: + RRETURN(MATCH_NOMATCH); + + /* Record the current recursing group number in mb->verb_current_recurse + when a backtracking return such as MATCH_COMMIT is given. This enables the + recurse processing to catch verbs from within the recursion. */ + + case OP_COMMIT: + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_COMMIT); + + case OP_COMMIT_ARG: + Fmark = mb->nomatch_mark = Fecode + 2; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_COMMIT); + + case OP_PRUNE: + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_PRUNE); + + case OP_PRUNE_ARG: + Fmark = mb->nomatch_mark = Fecode + 2; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_PRUNE); + + case OP_SKIP: + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_skip_ptr = Feptr; /* Pass back current position */ 
+ mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_SKIP); + + /* Note that, for Perl compatibility, SKIP with an argument does NOT set + nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was + not a matching mark, we have to re-run the match, ignoring the SKIP_ARG + that failed and any that precede it (either they also failed, or were not + triggered). To do this, we maintain a count of executed SKIP_ARGs. If a + SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg + set to the count of the one that failed. */ + + case OP_SKIP_ARG: + mb->skip_arg_count++; + if (mb->skip_arg_count <= mb->ignore_skip_arg) + { + Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; + break; + } + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + + /* Pass back the current skip name and return the special MATCH_SKIP_ARG + return code. This will either be caught by a matching MARK, or get to the + top, where it causes a rematch with mb->ignore_skip_arg set to the value of + mb->skip_arg_count. */ + + mb->verb_skip_ptr = Fecode + 2; + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_SKIP_ARG); + + /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that + the branch in which it occurs can be determined. */ + + case OP_THEN: + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_ecode_ptr = Fecode; + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_THEN); + + case OP_THEN_ARG: + Fmark = mb->nomatch_mark = Fecode + 2; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_ecode_ptr = Fecode; + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_THEN); + + + /* ===================================================================== */ + /* There's been some horrible disaster. 
Arrival here can only mean there is + something seriously wrong in the code above or the OP_xxx definitions. */ + + default: + return PCRE2_ERROR_INTERNAL; + } + + /* Do not insert any code in here without much thought; it is assumed + that "continue" in the code above comes out to here to repeat the main + loop. */ + + } /* End of main loop */ +/* Control never reaches here */ + + +/* ========================================================================= */ +/* The RRETURN() macro jumps here. The number that is saved in Freturn_id +indicates which label we actually want to return to. The value in Frdepth is +the index number of the frame in the vector. The return value has been placed +in rrc. */ + +#define LBL(val) case val: goto L_RM##val; + +RETURN_SWITCH: +if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; +if (Frdepth == 0) return rrc; /* Exit from the top level */ +F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ +mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ + +#ifdef DEBUG_SHOW_RMATCH +fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id); +#endif + +switch (Freturn_id) + { + LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) + LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) + LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) + LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) + LBL(33) LBL(34) LBL(35) LBL(36) + +#ifdef SUPPORT_WIDE_CHARS + LBL(100) LBL(101) +#endif + +#ifdef SUPPORT_UNICODE + LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) + LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) + LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) + LBL(221) LBL(222) LBL(223) LBL(224) LBL(225) +#endif + + default: + return PCRE2_ERROR_INTERNAL; + } +#undef LBL +} + + +/************************************************* +* Match a Regular Expression * +*************************************************/ + +/* This 
function applies a compiled pattern to a subject string and picks out +portions of the string if it matches. Two elements in the vector are set for +each substring: the offsets to the start and end of the substring. + +Arguments: + code points to the compiled expression + subject points to the subject string + length length of subject string (may contain binary zeros) + start_offset where to start in the subject string + options option bits + match_data points to a match_data block + mcontext points to a PCRE2 context + +Returns: > 0 => success; value is the number of ovector pairs filled + = 0 => success, but ovector is not big enough + = -1 => failed to match (PCRE2_ERROR_NOMATCH) + = -2 => partial match (PCRE2_ERROR_PARTIAL) + < -2 => some kind of unexpected problem +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext) +{ +int rc; +int was_zero_terminated = 0; +const uint8_t *start_bits = NULL; +const pcre2_real_code *re = (const pcre2_real_code *)code; + +BOOL anchored; +BOOL firstline; +BOOL has_first_cu = FALSE; +BOOL has_req_cu = FALSE; +BOOL startline; + +#if PCRE2_CODE_UNIT_WIDTH == 8 +PCRE2_SPTR memchr_found_first_cu; +PCRE2_SPTR memchr_found_first_cu2; +#endif + +PCRE2_UCHAR first_cu = 0; +PCRE2_UCHAR first_cu2 = 0; +PCRE2_UCHAR req_cu = 0; +PCRE2_UCHAR req_cu2 = 0; + +PCRE2_SPTR bumpalong_limit; +PCRE2_SPTR end_subject; +PCRE2_SPTR true_end_subject; +PCRE2_SPTR start_match; +PCRE2_SPTR req_cu_ptr; +PCRE2_SPTR start_partial; +PCRE2_SPTR match_partial; + +#ifdef SUPPORT_JIT +BOOL use_jit; +#endif + +/* This flag is needed even when Unicode is not supported for convenience +(it is used by the IS_NEWLINE macro). 
*/ + +BOOL utf = FALSE; + +#ifdef SUPPORT_UNICODE +BOOL ucp = FALSE; +BOOL allow_invalid; +uint32_t fragment_options = 0; +#ifdef SUPPORT_JIT +BOOL jit_checked_utf = FALSE; +#endif +#endif /* SUPPORT_UNICODE */ + +PCRE2_SIZE frame_size; +PCRE2_SIZE heapframes_size; + +/* We need to have mb as a pointer to a match block, because the IS_NEWLINE +macro is used below, and it expects NLBLOCK to be defined as a pointer. */ + +pcre2_callout_block cb; +match_block actual_match_block; +match_block *mb = &actual_match_block; + +/* Recognize NULL, length 0 as an empty string. */ + +if (subject == NULL && length == 0) subject = (PCRE2_SPTR)""; + +/* Plausibility checks */ + +if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; +if (code == NULL || subject == NULL || match_data == NULL) + return PCRE2_ERROR_NULL; + +start_match = subject + start_offset; +req_cu_ptr = start_match - 1; +if (length == PCRE2_ZERO_TERMINATED) + { + length = PRIV(strlen)(subject); + was_zero_terminated = 1; + } +true_end_subject = end_subject = subject + length; + +if (start_offset > length) return PCRE2_ERROR_BADOFFSET; + +/* Check that the first field in the block is the magic number. */ + +if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; + +/* Check the code unit width. */ + +if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) + return PCRE2_ERROR_BADMODE; + +/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the +options variable for this function. Users of PCRE2 who are not calling the +function directly would like to have a way of setting these flags, in the same +way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and +(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now +transfer to the options for this function. The bits are guaranteed to be +adjacent, but do not have the same values. 
This bit of Boolean trickery assumes +that the match-time bits are not more significant than the flag bits. If by +accident this is not the case, a compile-time division by zero error will +occur. */ + +#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) +#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); +#undef FF +#undef OO + +/* If the pattern was successfully studied with JIT support, we will run the +JIT executable instead of the rest of this function. Most options must be set +at compile time for the JIT code to be usable. */ + +#ifdef SUPPORT_JIT +use_jit = (re->executable_jit != NULL && + (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); +#endif + +/* Initialize UTF/UCP parameters. */ + +#ifdef SUPPORT_UNICODE +utf = (re->overall_options & PCRE2_UTF) != 0; +allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; +ucp = (re->overall_options & PCRE2_UCP) != 0; +#endif /* SUPPORT_UNICODE */ + +/* Convert the partial matching flags into an integer. */ + +mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : + ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; + +/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same +time. */ + +if (mb->partial != 0 && + ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) + return PCRE2_ERROR_BADOPTION; + +/* It is an error to set an offset limit without setting the flag at compile +time. */ + +if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && + (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) + return PCRE2_ERROR_BADOFFSETLIMIT; + +/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, +free the memory that was obtained. Set the field to NULL for no match cases. 
*/ + +if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) + { + match_data->memctl.free((void *)match_data->subject, + match_data->memctl.memory_data); + match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; + } +match_data->subject = NULL; + +/* Zero the error offset in case the first code unit is invalid UTF. */ + +match_data->startchar = 0; + + +/* ============================= JIT matching ============================== */ + +/* Prepare for JIT matching. Check a UTF string for validity unless no check is +requested or invalid UTF can be handled. We check only the portion of the +subject that might be inspected during matching - from the offset minus the +maximum lookbehind to the given length. This saves time when a small part of a +large subject is being matched by the use of a starting offset. Note that the +maximum lookbehind is a number of characters, not code units. */ + +#ifdef SUPPORT_JIT +if (use_jit) + { +#ifdef SUPPORT_UNICODE + if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + unsigned int i; +#endif + + /* For 8-bit and 16-bit UTF, check that the first code unit is a valid + character start. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + { + if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 + return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ +#else + return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ +#endif + } +#endif /* WIDTH != 32 */ + + /* Move back by the maximum lookbehind, just in case it happens at the very + start of matching. 
*/ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) + { + start_match--; + while (start_match > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*start_match & 0xc0) == 0x80) +#else /* 16-bit */ + (*start_match & 0xfc00) == 0xdc00) +#endif + start_match--; + } +#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* In the 32-bit library, one code unit equals one character. However, + we cannot just subtract the lookbehind and then compare pointers, because + a very large lookbehind could create an invalid pointer. */ + + if (start_offset >= re->max_lookbehind) + start_match -= re->max_lookbehind; + else + start_match = subject; +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* Validate the relevant portion of the subject. Adjust the offset of an + invalid code point to be an absolute offset in the whole string. */ + + match_data->rc = PRIV(valid_utf)(start_match, + length - (start_match - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += start_match - subject; + return match_data->rc; + } + jit_checked_utf = TRUE; + } +#endif /* SUPPORT_UNICODE */ + + /* If JIT returns BADOPTION, which means that the selected complete or + partial matching mode was not compiled, fall through to the interpreter. */ + + rc = pcre2_jit_match(code, subject, length, start_offset, options, + match_data, mcontext); + if (rc != PCRE2_ERROR_JIT_BADOPTION) + { + if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) + { + length = CU2BYTES(length + was_zero_terminated); + match_data->subject = match_data->memctl.malloc(length, + match_data->memctl.memory_data); + if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy((void *)match_data->subject, subject, length); + match_data->flags |= PCRE2_MD_COPIED_SUBJECT; + } + return rc; + } + } +#endif /* SUPPORT_JIT */ + +/* ========================= End of JIT matching ========================== */ + + +/* Proceed with non-JIT matching. 
The default is to allow lookbehinds to the +start of the subject. A UTF check when there is a non-zero offset may change +this. */ + +mb->check_subject = subject; + +/* If a UTF subject string was not checked for validity in the JIT code above, +check it here, and handle support for invalid UTF strings. The check above +happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset. +If we get here in those circumstances, it means the subject string is valid, +but for some reason JIT matching was not successful. There is no need to check +the subject again. + +We check only the portion of the subject that might be inspected during +matching - from the offset minus the maximum lookbehind to the given length. +This saves time when a small part of a large subject is being matched by the +use of a starting offset. Note that the maximum lookbehind is a number of +characters, not code units. + +Note also that support for invalid UTF forces a check, overriding the setting +of PCRE2_NO_UTF_CHECK. */ + +#ifdef SUPPORT_UNICODE +if (utf && +#ifdef SUPPORT_JIT + !jit_checked_utf && +#endif + ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + BOOL skipped_bad_start = FALSE; +#endif + + /* For 8-bit and 16-bit UTF, check that the first code unit is a valid + character start. If we are handling invalid UTF, just skip over such code + units. Otherwise, give an appropriate error. 
*/ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + if (allow_invalid) + { + while (start_match < end_subject && NOT_FIRSTCU(*start_match)) + { + start_match++; + skipped_bad_start = TRUE; + } + } + else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + { + if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 + return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ +#else + return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ +#endif + } +#endif /* WIDTH != 32 */ + + /* The mb->check_subject field points to the start of UTF checking; + lookbehinds can go back no further than this. */ + + mb->check_subject = start_match; + + /* Move back by the maximum lookbehind, just in case it happens at the very + start of matching, but don't do this if we skipped bad 8-bit or 16-bit code + units above. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + if (!skipped_bad_start) + { + unsigned int i; + for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) + { + mb->check_subject--; + while (mb->check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*mb->check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*mb->check_subject & 0xfc00) == 0xdc00) +#endif + mb->check_subject--; + } + } +#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* In the 32-bit library, one code unit equals one character. However, + we cannot just subtract the lookbehind and then compare pointers, because + a very large lookbehind could create an invalid pointer. */ + + if (start_offset >= re->max_lookbehind) + mb->check_subject -= re->max_lookbehind; + else + mb->check_subject = subject; +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* Validate the relevant portion of the subject. There's a loop in case we + encounter bad UTF in the characters preceding start_match which we are + scanning because of a lookbehind. 
*/ + + for (;;) + { + match_data->rc = PRIV(valid_utf)(mb->check_subject, + length - (mb->check_subject - subject), &(match_data->startchar)); + + if (match_data->rc == 0) break; /* Valid UTF string */ + + /* Invalid UTF string. Adjust the offset to be an absolute offset in the + whole string. If we are handling invalid UTF strings, set end_subject to + stop before the bad code unit, and set the options to "not end of line". + Otherwise return the error. */ + + match_data->startchar += mb->check_subject - subject; + if (!allow_invalid || match_data->rc > 0) return match_data->rc; + end_subject = subject + match_data->startchar; + + /* If the end precedes start_match, it means there is invalid UTF in the + extra code units we reversed over because of a lookbehind. Advance past the + first bad code unit, and then skip invalid character starting code units in + 8-bit and 16-bit modes, and try again with the original end point. */ + + if (end_subject < start_match) + { + mb->check_subject = end_subject + 1; +#if PCRE2_CODE_UNIT_WIDTH != 32 + while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) + mb->check_subject++; +#endif + end_subject = true_end_subject; + } + + /* Otherwise, set the not end of line option, and do the match. */ + + else + { + fragment_options = PCRE2_NOTEOL; + break; + } + } + } +#endif /* SUPPORT_UNICODE */ + +/* A NULL match context means "use a default context", but we take the memory +control functions from the pattern. */ + +if (mcontext == NULL) + { + mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); + mb->memctl = re->memctl; + } +else mb->memctl = mcontext->memctl; + +anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; +firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; +startline = (re->flags & PCRE2_STARTLINE) != 0; +bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? 
+ true_end_subject : subject + mcontext->offset_limit; + +/* Initialize and set up the fixed fields in the callout block, with a pointer +in the match block. */ + +mb->cb = &cb; +cb.version = 2; +cb.subject = subject; +cb.subject_length = (PCRE2_SIZE)(end_subject - subject); +cb.callout_flags = 0; + +/* Fill in the remaining fields in the match block, except for moptions, which +gets set later. */ + +mb->callout = mcontext->callout; +mb->callout_data = mcontext->callout_data; + +mb->start_subject = subject; +mb->start_offset = start_offset; +mb->end_subject = end_subject; +mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; +mb->allowemptypartial = (re->max_lookbehind > 0) || + (re->flags & PCRE2_MATCH_EMPTY) != 0; +mb->poptions = re->overall_options; /* Pattern options */ +mb->ignore_skip_arg = 0; +mb->mark = mb->nomatch_mark = NULL; /* In case never set */ + +/* The name table is needed for finding all the numbers associated with a +given name, for condition testing. The code follows the name table. */ + +mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); +mb->name_count = re->name_count; +mb->name_entry_size = re->name_entry_size; +mb->start_code = mb->name_table + re->name_count * re->name_entry_size; + +/* Process the \R and newline settings. 
*/ + +mb->bsr_convention = re->bsr_convention; +mb->nltype = NLTYPE_FIXED; +switch(re->newline_convention) + { + case PCRE2_NEWLINE_CR: + mb->nllen = 1; + mb->nl[0] = CHAR_CR; + break; + + case PCRE2_NEWLINE_LF: + mb->nllen = 1; + mb->nl[0] = CHAR_NL; + break; + + case PCRE2_NEWLINE_NUL: + mb->nllen = 1; + mb->nl[0] = CHAR_NUL; + break; + + case PCRE2_NEWLINE_CRLF: + mb->nllen = 2; + mb->nl[0] = CHAR_CR; + mb->nl[1] = CHAR_NL; + break; + + case PCRE2_NEWLINE_ANY: + mb->nltype = NLTYPE_ANY; + break; + + case PCRE2_NEWLINE_ANYCRLF: + mb->nltype = NLTYPE_ANYCRLF; + break; + + default: return PCRE2_ERROR_INTERNAL; + } + +/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE +vector at the end, whose size depends on the number of capturing parentheses in +the pattern. It is not used at all if there are no capturing parentheses. + + frame_size is the total size of each frame + match_data->heapframes is the pointer to the frames vector + match_data->heapframes_size is the allocated size of the vector + +We must pad the frame_size for alignment to ensure subsequent frames are as +aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE +array, that does not guarantee it is suitably aligned for pointers, as some +architectures have pointers that are larger than a size_t. */ + +frame_size = (offsetof(heapframe, ovector) + + re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) & + ~(HEAPFRAME_ALIGNMENT - 1); + +/* Limits set in the pattern override the match context only if they are +smaller. */ + +mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)? + mcontext->heap_limit : re->limit_heap); + +mb->match_limit = (mcontext->match_limit < re->limit_match)? + mcontext->match_limit : re->limit_match; + +mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? + mcontext->depth_limit : re->limit_depth; + +/* If a pattern has very many capturing parentheses, the frame size may be very +large. 
Set the initial frame vector size to ensure that there are at least 10 +available frames, but enforce a minimum of START_FRAMES_SIZE. If this is +greater than the heap limit, get as large a vector as possible. Always round +the size to a multiple of the frame size. */ + +heapframes_size = frame_size * 10; +if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE; +if (heapframes_size / 1024 > mb->heap_limit) + { + PCRE2_SIZE max_size = 1024 * mb->heap_limit; + if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT; + heapframes_size = max_size; + } + +/* If an existing frame vector in the match_data block is large enough, we can +use it. Otherwise, free any pre-existing vector and get a new one. */ + +if (match_data->heapframes_size < heapframes_size) + { + match_data->memctl.free(match_data->heapframes, + match_data->memctl.memory_data); + match_data->heapframes = match_data->memctl.malloc(heapframes_size, + match_data->memctl.memory_data); + if (match_data->heapframes == NULL) + { + match_data->heapframes_size = 0; + return PCRE2_ERROR_NOMEMORY; + } + match_data->heapframes_size = heapframes_size; + } + +/* Write to the ovector within the first frame to mark every capture unset and +to avoid uninitialized memory read errors when it is copied to a new frame. */ + +memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff, + frame_size - offsetof(heapframe, ovector)); + +/* Pointers to the individual character tables */ + +mb->lcc = re->tables + lcc_offset; +mb->fcc = re->tables + fcc_offset; +mb->ctypes = re->tables + ctypes_offset; + +/* Set up the first code unit to match, if available. If there's no first code +unit there may be a bitmap of possible first characters. 
*/ + +if ((re->flags & PCRE2_FIRSTSET) != 0) + { + has_first_cu = TRUE; + first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); + if ((re->flags & PCRE2_FIRSTCASELESS) != 0) + { + first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); +#endif +#endif /* SUPPORT_UNICODE */ + } + } +else + if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) + start_bits = re->start_bitmap; + +/* There may also be a "last known required character" set. */ + +if ((re->flags & PCRE2_LASTSET) != 0) + { + has_req_cu = TRUE; + req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); + if ((re->flags & PCRE2_LASTCASELESS) != 0) + { + req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); +#endif +#endif /* SUPPORT_UNICODE */ + } + } + + +/* ==========================================================================*/ + +/* Loop for handling unanchored repeated matching attempts; for anchored regexs +the loop runs just once. */ + +#ifdef SUPPORT_UNICODE +FRAGMENT_RESTART: +#endif + +start_partial = match_partial = NULL; +mb->hitend = FALSE; + +#if PCRE2_CODE_UNIT_WIDTH == 8 +memchr_found_first_cu = NULL; +memchr_found_first_cu2 = NULL; +#endif + +for(;;) + { + PCRE2_SPTR new_start_match; + + /* ----------------- Start of match optimizations ---------------- */ + + /* There are some optimizations that avoid running the match if a known + starting point is not found, or if a known later code unit is not present. + However, there is an option (settable at compile time) that disables these, + for testing and for ensuring that all callouts do actually occur. 
*/ + + if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) + { + /* If firstline is TRUE, the start of the match is constrained to the first + line of a multiline string. That is, the match must be before or at the + first newline following the start of matching. Temporarily adjust + end_subject so that we stop the scans for a first code unit at a newline. + If the match fails at the newline, later code breaks the loop. */ + + if (firstline) + { + PCRE2_SPTR t = start_match; +#ifdef SUPPORT_UNICODE + if (utf) + { + while (t < end_subject && !IS_NEWLINE(t)) + { + t++; + ACROSSCHAR(t < end_subject, t, t++); + } + } + else +#endif + while (t < end_subject && !IS_NEWLINE(t)) t++; + end_subject = t; + } + + /* Anchored: check the first code unit if one is recorded. This may seem + pointless but it can help in detecting a no match case without scanning for + the required code unit. */ + + if (anchored) + { + if (has_first_cu || start_bits != NULL) + { + BOOL ok = start_match < end_subject; + if (ok) + { + PCRE2_UCHAR c = UCHAR21TEST(start_match); + ok = has_first_cu && (c == first_cu || c == first_cu2); + if (!ok && start_bits != NULL) + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + ok = (start_bits[c/8] & (1u << (c&7))) != 0; + } + } + if (!ok) + { + rc = MATCH_NOMATCH; + break; + } + } + } + + /* Not anchored. Advance to a unique first code unit if there is one. */ + + else + { + if (has_first_cu) + { + if (first_cu != first_cu2) /* Caseless */ + { + /* In 16-bit and 32_bit modes we have to do our own search, so can + look for both cases at once. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + PCRE2_UCHAR smc; + while (start_match < end_subject && + (smc = UCHAR21TEST(start_match)) != first_cu && + smc != first_cu2) + start_match++; +#else + /* In 8-bit mode, the use of memchr() gives a big speed up, even + though we have to call it twice in order to find the earliest + occurrence of the code unit in either of its cases. 
Caching is used + to remember the positions of previously found code units. This can + make a huge difference when the strings are very long and only one + case is actually present. */ + + PCRE2_SPTR pp1 = NULL; + PCRE2_SPTR pp2 = NULL; + PCRE2_SIZE searchlength = end_subject - start_match; + + /* If we haven't got a previously found position for first_cu, or if + the current starting position is later, we need to do a search. If + the code unit is not found, set it to the end. */ + + if (memchr_found_first_cu == NULL || + start_match > memchr_found_first_cu) + { + pp1 = memchr(start_match, first_cu, searchlength); + memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; + } + + /* If the start is before a previously found position, use the + previous position, or NULL if a previous search failed. */ + + else pp1 = (memchr_found_first_cu == end_subject)? NULL : + memchr_found_first_cu; + + /* Do the same thing for the other case. */ + + if (memchr_found_first_cu2 == NULL || + start_match > memchr_found_first_cu2) + { + pp2 = memchr(start_match, first_cu2, searchlength); + memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; + } + + else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : + memchr_found_first_cu2; + + /* Set the start to the end of the subject if neither case was found. + Otherwise, use the earlier found point. */ + + if (pp1 == NULL) + start_match = (pp2 == NULL)? end_subject : pp2; + else + start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; + +#endif /* 8-bit handling */ + } + + /* The caseful case is much simpler. 
*/ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (start_match < end_subject && UCHAR21TEST(start_match) != + first_cu) + start_match++; +#else + start_match = memchr(start_match, first_cu, end_subject - start_match); + if (start_match == NULL) start_match = end_subject; +#endif + } + + /* If we can't find the required first code unit, having reached the + true end of the subject, break the bumpalong loop, to force a match + failure, except when doing partial matching, when we let the next cycle + run at the end of the subject. To see why, consider the pattern + /(?<=abc)def/, which partially matches "abc", even though the string + does not contain the starting character "d". If we have not reached the + true end of the subject (PCRE2_FIRSTLINE caused end_subject to be + temporarily modified) we also let the cycle run, because the matching + string is legitimately allowed to start with the first code unit of a + newline. */ + + if (mb->partial == 0 && start_match >= mb->end_subject) + { + rc = MATCH_NOMATCH; + break; + } + } + + /* If there's no first code unit, advance to just after a linebreak for a + multiline match if required. */ + + else if (startline) + { + if (start_match > mb->start_subject + start_offset) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + ACROSSCHAR(start_match < end_subject, start_match, start_match++); + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or + ANYCRLF, and we are now at a LF, advance the match position by one + more code unit. 
*/ + + if (start_match[-1] == CHAR_CR && + (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + UCHAR21TEST(start_match) == CHAR_NL) + start_match++; + } + } + + /* If there's no first code unit or a requirement for a multiline line + start, advance to a non-unique first code unit if any have been + identified. The bitmap contains only 256 bits. When code units are 16 or + 32 bits wide, all code units greater than 254 set the 255 bit. */ + + else if (start_bits != NULL) + { + while (start_match < end_subject) + { + uint32_t c = UCHAR21TEST(start_match); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + if ((start_bits[c/8] & (1u << (c&7))) != 0) break; + start_match++; + } + + /* See comment above in first_cu checking about the next few lines. */ + + if (mb->partial == 0 && start_match >= mb->end_subject) + { + rc = MATCH_NOMATCH; + break; + } + } + } /* End first code unit handling */ + + /* Restore fudged end_subject */ + + end_subject = mb->end_subject; + + /* The following two optimizations must be disabled for partial matching. */ + + if (mb->partial == 0) + { + PCRE2_SPTR p; + + /* The minimum matching length is a lower bound; no string of that length + may actually match the pattern. Although the value is, strictly, in + characters, we treat it as code units to avoid spending too much time in + this optimization. */ + + if (end_subject - start_match < re->minlength) + { + rc = MATCH_NOMATCH; + break; + } + + /* If req_cu is set, we know that that code unit must appear in the + subject for the (non-partial) match to succeed. If the first code unit is + set, req_cu must be later in the subject; otherwise the test starts at + the match point. This optimization can save a huge amount of backtracking + in patterns with nested unlimited repeats that aren't going to match. + Writing separate code for caseful/caseless versions makes it go faster, + as does using an autoincrement and backing off on a match. 
As in the case + of the first code unit, using memchr() in the 8-bit library gives a big + speed up. Unlike the first_cu check above, we do not need to call + memchr() twice in the caseless case because we only need to check for the + presence of the character in either case, not find the first occurrence. + + The search can be skipped if the code unit was found later than the + current starting point in a previous iteration of the bumpalong loop. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary + anchored patterns. This showed up when somebody was matching something + like /^\d+C/ on a 32-megabyte string... so we don't do this when the + string is sufficiently long, but it's worth searching a lot more for + unanchored patterns. */ + + p = start_match + (has_first_cu? 1:0); + if (has_req_cu && p > req_cu_ptr) + { + PCRE2_SIZE check_length = end_subject - start_match; + + if (check_length < REQ_CU_MAX || + (!anchored && check_length < REQ_CU_MAX * 1000)) + { + if (req_cu != req_cu2) /* Caseless */ + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (p < end_subject) + { + uint32_t pp = UCHAR21INCTEST(p); + if (pp == req_cu || pp == req_cu2) { p--; break; } + } +#else /* 8-bit code units */ + PCRE2_SPTR pp = p; + p = memchr(pp, req_cu, end_subject - pp); + if (p == NULL) + { + p = memchr(pp, req_cu2, end_subject - pp); + if (p == NULL) p = end_subject; + } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + } + + /* The caseful case */ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (p < end_subject) + { + if (UCHAR21INCTEST(p) == req_cu) { p--; break; } + } + +#else /* 8-bit code units */ + p = memchr(p, req_cu, end_subject - p); + if (p == NULL) p = end_subject; +#endif + } + + /* If we can't find the required code unit, break the bumpalong loop, + forcing a match failure. 
*/ + + if (p >= end_subject) + { + rc = MATCH_NOMATCH; + break; + } + + /* If we have found the required code unit, save the point where we + found it, so that we don't search again next time round the bumpalong + loop if the start hasn't yet passed this code unit. */ + + req_cu_ptr = p; + } + } + } + } + + /* ------------ End of start of match optimizations ------------ */ + + /* Give no match if we have passed the bumpalong limit. */ + + if (start_match > bumpalong_limit) + { + rc = MATCH_NOMATCH; + break; + } + + /* OK, we can now run the match. If "hitend" is set afterwards, remember the + first starting point for which a partial match was found. */ + + cb.start_match = (PCRE2_SIZE)(start_match - subject); + cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; + + mb->start_used_ptr = start_match; + mb->last_used_ptr = start_match; +#ifdef SUPPORT_UNICODE + mb->moptions = options | fragment_options; +#else + mb->moptions = options; +#endif + mb->match_call_count = 0; + mb->end_offset_top = 0; + mb->skip_arg_count = 0; + + rc = match(start_match, mb->start_code, re->top_bracket, frame_size, + match_data, mb); + + if (mb->hitend && start_partial == NULL) + { + start_partial = mb->start_used_ptr; + match_partial = start_match; + } + + switch(rc) + { + /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched + the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP + entirely. The only way we can do that is to re-do the match at the same + point, with a flag to force SKIP with an argument to be ignored. Just + treating this case as NOMATCH does not work because it does not check other + alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ + + case MATCH_SKIP_ARG: + new_start_match = start_match; + mb->ignore_skip_arg = mb->skip_arg_count; + break; + + /* SKIP passes back the next starting point explicitly, but if it is no + greater than the match we have just done, treat it as NOMATCH. 
*/ + + case MATCH_SKIP: + if (mb->verb_skip_ptr > start_match) + { + new_start_match = mb->verb_skip_ptr; + break; + } + /* Fall through */ + + /* NOMATCH and PRUNE advance by one character. THEN at this level acts + exactly like PRUNE. Unset ignore SKIP-with-argument. */ + + case MATCH_NOMATCH: + case MATCH_PRUNE: + case MATCH_THEN: + mb->ignore_skip_arg = 0; + new_start_match = start_match + 1; +#ifdef SUPPORT_UNICODE + if (utf) + ACROSSCHAR(new_start_match < end_subject, new_start_match, + new_start_match++); +#endif + break; + + /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ + + case MATCH_COMMIT: + rc = MATCH_NOMATCH; + goto ENDLOOP; + + /* Any other return is either a match, or some kind of error. */ + + default: + goto ENDLOOP; + } + + /* Control reaches here for the various types of "no match at this point" + result. Reset the code to MATCH_NOMATCH for subsequent checking. */ + + rc = MATCH_NOMATCH; + + /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first + newline in the subject (though it may continue over the newline). Therefore, + if we have just failed to match, starting at a newline, do not continue. */ + + if (firstline && IS_NEWLINE(start_match)) break; + + /* Advance to new matching position */ + + start_match = new_start_match; + + /* Break the loop if the pattern is anchored or if we have passed the end of + the subject. */ + + if (anchored || start_match > end_subject) break; + + /* If we have just passed a CR and we are now at a LF, and the pattern does + not contain any explicit matches for \r or \n, and the newline option is CRLF + or ANY or ANYCRLF, advance the match position by one more code unit. In + normal matching start_match will aways be greater than the first position at + this stage, but a failed *SKIP can cause a return at the same point, which is + why the first test exists. 
*/ + + if (start_match > subject + start_offset && + start_match[-1] == CHAR_CR && + start_match < end_subject && + *start_match == CHAR_NL && + (re->flags & PCRE2_HASCRORLF) == 0 && + (mb->nltype == NLTYPE_ANY || + mb->nltype == NLTYPE_ANYCRLF || + mb->nllen == 2)) + start_match++; + + mb->mark = NULL; /* Reset for start of next match attempt */ + } /* End of for(;;) "bumpalong" loop */ + +/* ==========================================================================*/ + +/* When we reach here, one of the following stopping conditions is true: + +(1) The match succeeded, either completely, or partially; + +(2) The pattern is anchored or the match was failed after (*COMMIT); + +(3) We are past the end of the subject or the bumpalong limit; + +(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because + this option requests that a match occur at or before the first newline in + the subject. + +(5) Some kind of error occurred. + +*/ + +ENDLOOP: + +/* If end_subject != true_end_subject, it means we are handling invalid UTF, +and have just processed a non-terminal fragment. If this resulted in no match +or a partial match we must carry on to the next fragment (a partial match is +returned to the caller only at the very end of the subject). A loop is used to +avoid trying to match against empty fragments; if the pattern can match an +empty string it would have done so already. */ + +#ifdef SUPPORT_UNICODE +if (utf && end_subject != true_end_subject && + (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) + { + for (;;) + { + /* Advance past the first bad code unit, and then skip invalid character + starting code units in 8-bit and 16-bit modes. */ + + start_match = end_subject + 1; + +#if PCRE2_CODE_UNIT_WIDTH != 32 + while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) + start_match++; +#endif + + /* If we have hit the end of the subject, there isn't another non-empty + fragment, so give up. 
*/ + + if (start_match >= true_end_subject) + { + rc = MATCH_NOMATCH; /* In case it was partial */ + break; + } + + /* Check the rest of the subject */ + + mb->check_subject = start_match; + rc = PRIV(valid_utf)(start_match, length - (start_match - subject), + &(match_data->startchar)); + + /* The rest of the subject is valid UTF. */ + + if (rc == 0) + { + mb->end_subject = end_subject = true_end_subject; + fragment_options = PCRE2_NOTBOL; + goto FRAGMENT_RESTART; + } + + /* A subsequent UTF error has been found; if the next fragment is + non-empty, set up to process it. Otherwise, let the loop advance. */ + + else if (rc < 0) + { + mb->end_subject = end_subject = start_match + match_data->startchar; + if (end_subject > start_match) + { + fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; + goto FRAGMENT_RESTART; + } + } + } + } +#endif /* SUPPORT_UNICODE */ + +/* Fill in fields that are always returned in the match data. */ + +match_data->code = re; +match_data->mark = mb->mark; +match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; + +/* Handle a fully successful match. Set the return code to the number of +captured strings, or 0 if there were too many to fit into the ovector, and then +set the remaining returned values before returning. Make a copy of the subject +string if requested. */ + +if (rc == MATCH_MATCH) + { + match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? + 0 : (int)mb->end_offset_top/2 + 1; + match_data->startchar = start_match - subject; + match_data->leftchar = mb->start_used_ptr - subject; + match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? 
+ mb->last_used_ptr : mb->end_match_ptr) - subject; + if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) + { + length = CU2BYTES(length + was_zero_terminated); + match_data->subject = match_data->memctl.malloc(length, + match_data->memctl.memory_data); + if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy((void *)match_data->subject, subject, length); + match_data->flags |= PCRE2_MD_COPIED_SUBJECT; + } + else match_data->subject = subject; + return match_data->rc; + } + +/* Control gets here if there has been a partial match, an error, or if the +overall match attempt has failed at all permitted starting positions. Any mark +data is in the nomatch_mark field. */ + +match_data->mark = mb->nomatch_mark; + +/* For anything other than nomatch or partial match, just return the code. */ + +if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; + +/* Handle a partial match. If a "soft" partial match was requested, searching +for a complete match will have continued, and the value of rc at this point +will be MATCH_NOMATCH. For a "hard" partial match, it will already be +PCRE2_ERROR_PARTIAL. */ + +else if (match_partial != NULL) + { + match_data->subject = subject; + match_data->ovector[0] = match_partial - subject; + match_data->ovector[1] = end_subject - subject; + match_data->startchar = match_partial - subject; + match_data->leftchar = start_partial - subject; + match_data->rightchar = end_subject - subject; + match_data->rc = PCRE2_ERROR_PARTIAL; + } + +/* Else this is the classic nomatch case. */ + +else match_data->rc = PCRE2_ERROR_NOMATCH; + +return match_data->rc; +} + +/* These #undefs are here to enable unity builds with CMake. 
*/ + +#undef NLBLOCK /* Block containing newline information */ +#undef PSSTART /* Field containing processed string start */ +#undef PSEND /* Field containing processed string end */ + +/* End of pcre2_match.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match_data.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match_data.c new file mode 100644 index 0000000000..5213b40a2b --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_match_data.c @@ -0,0 +1,185 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + + +/************************************************* +* Create a match data block given ovector size * +*************************************************/ + +/* A minimum of 1 is imposed on the number of ovector pairs. A maximum is also +imposed because the oveccount field in a match data block is uint16_t. */ +/* Returns the new block, or NULL if the allocation fails. */ +PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION +pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext) +{ +pcre2_match_data *yield; +if (oveccount < 1) oveccount = 1; /* Enforce the minimum */ +if (oveccount > UINT16_MAX) oveccount = UINT16_MAX; /* Enforce the maximum */ +yield = PRIV(memctl_malloc)( /* Header plus two PCRE2_SIZE values per pair */ + offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE), + (pcre2_memctl *)gcontext); +if (yield == NULL) return NULL; +yield->oveccount = oveccount; +yield->flags = 0; +yield->heapframes = NULL; /* No heap frames are attached at creation */ +yield->heapframes_size = 0; +return yield; +} + + + +/************************************************* +* Create a match data block using pattern data * +*************************************************/ + +/* If no context is supplied, use the memory allocator from the code.
*/ +/* The new block is given top_bracket + 1 ovector pairs. */ +PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION +pcre2_match_data_create_from_pattern(const pcre2_code *code, + pcre2_general_context *gcontext) +{ +if (gcontext == NULL) gcontext = (pcre2_general_context *)code; +return pcre2_match_data_create(((pcre2_real_code *)code)->top_bracket + 1, + gcontext); +} + + + +/************************************************* +* Free a match data block * +*************************************************/ +/* NULL is ignored; heapframes and any copied subject are released as well. */ +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_match_data_free(pcre2_match_data *match_data) +{ +if (match_data != NULL) + { + if (match_data->heapframes != NULL) + match_data->memctl.free(match_data->heapframes, + match_data->memctl.memory_data); + if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) + match_data->memctl.free((void *)match_data->subject, + match_data->memctl.memory_data); + match_data->memctl.free(match_data, match_data->memctl.memory_data); + } +} + + + +/************************************************* +* Get last mark in match * +*************************************************/ +/* Returns the mark field of the block; match_data is not checked for NULL. */ +PCRE2_EXP_DEFN PCRE2_SPTR PCRE2_CALL_CONVENTION +pcre2_get_mark(pcre2_match_data *match_data) +{ +return match_data->mark; +} + + + +/************************************************* +* Get pointer to ovector * +*************************************************/ +/* Returns the ovector member of the match data block. */ +PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION +pcre2_get_ovector_pointer(pcre2_match_data *match_data) +{ +return match_data->ovector; +} + + + +/************************************************* +* Get number of ovector slots * +*************************************************/ +/* Returns the oveccount field of the match data block. */ +PCRE2_EXP_DEFN uint32_t PCRE2_CALL_CONVENTION +pcre2_get_ovector_count(pcre2_match_data *match_data) +{ +return match_data->oveccount; +} + + + +/************************************************* +* Get starting code unit in match * +*************************************************/ +/* Returns the startchar field of the match data block. */ +PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
+pcre2_get_startchar(pcre2_match_data *match_data) +{ +return match_data->startchar; +} + + + +/************************************************* +* Get size of match data block * +*************************************************/ +/* Size is the header up to ovector plus two PCRE2_SIZE values per pair. */ +PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION +pcre2_get_match_data_size(pcre2_match_data *match_data) +{ +return offsetof(pcre2_match_data, ovector) + + 2 * (match_data->oveccount) * sizeof(PCRE2_SIZE); +} + + + +/************************************************* +* Get heapframes size * +*************************************************/ +/* Returns the heapframes_size field of the match data block. */ +PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION +pcre2_get_match_data_heapframes_size(pcre2_match_data *match_data) +{ +return match_data->heapframes_size; +} + +/* End of pcre2_match_data.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_newline.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_newline.c new file mode 100644 index 0000000000..cf44e90379 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_newline.c @@ -0,0 +1,243 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer.
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains internal functions for testing newlines when more than +one kind of newline is to be recognized. When a newline is found, its length is +returned. In principle, we could implement several newline "types", each +referring to a different set of newline characters. At present, PCRE2 supports +only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, +and NLTYPE_ANY. The full list of Unicode newline characters is taken from +http://unicode.org/unicode/reports/tr18/. 
*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + + +/************************************************* +* Check for newline at given position * +*************************************************/ + +/* This function is called only via the IS_NEWLINE macro, which does so only +when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed +newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit +pointed to by ptr is less than the end of the string. + +Arguments: + ptr pointer to possible newline + type the newline type + endptr pointer to the end of the string + lenptr where to return the length + utf TRUE if in utf mode + +Returns: TRUE with *lenptr set if a newline starts at ptr, else FALSE +*/ + +BOOL +PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr, + uint32_t *lenptr, BOOL utf) +{ +uint32_t c; + +#ifdef SUPPORT_UNICODE +if (utf) { GETCHAR(c, ptr); } else c = *ptr; +#else +(void)utf; +c = *ptr; +#endif /* SUPPORT_UNICODE */ +/* For NLTYPE_ANYCRLF only LF, CR, and CR followed by LF are newlines. */ +if (type == NLTYPE_ANYCRLF) switch(c) + { + case CHAR_LF: + *lenptr = 1; + return TRUE; + /* CR followed by LF before endptr counts as one newline of length 2. */ + case CHAR_CR: + *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; + return TRUE; + + default: + return FALSE; + } + +/* NLTYPE_ANY */ + +else switch(c) + { +#ifdef EBCDIC + case CHAR_NEL: +#endif + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + *lenptr = 1; + return TRUE; + + case CHAR_CR: + *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; + return TRUE; + +#ifndef EBCDIC +#if PCRE2_CODE_UNIT_WIDTH == 8 + case CHAR_NEL: + *lenptr = utf?
2 : 1; /* NEL is two bytes in UTF-8 */ + return TRUE; + /* LS and PS each occupy three bytes in UTF-8. */ + case 0x2028: /* LS */ + case 0x2029: /* PS */ + *lenptr = 3; + return TRUE; + +#else /* 16-bit or 32-bit code units */ + case CHAR_NEL: + case 0x2028: /* LS */ + case 0x2029: /* PS */ + *lenptr = 1; + return TRUE; +#endif +#endif /* Not EBCDIC */ + + default: + return FALSE; + } +} + + + +/************************************************* +* Check for newline at previous position * +*************************************************/ + +/* This function is called only via the WAS_NEWLINE macro, which does so only +when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed +newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial +value of ptr is greater than the start of the string that is being processed. + +Arguments: + ptr pointer to just after the possible newline + type the newline type + startptr pointer to the start of the string + lenptr where to return the length + utf TRUE if in utf mode + +Returns: TRUE with *lenptr set if a newline ends at ptr, else FALSE +*/ + +BOOL +PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr, + uint32_t *lenptr, BOOL utf) +{ +uint32_t c; +ptr--; /* Step back to the preceding code unit */ + +#ifdef SUPPORT_UNICODE +if (utf) + { + BACKCHAR(ptr); /* NOTE(review): presumably backs up to the character start */ + GETCHAR(c, ptr); + } +else c = *ptr; +#else +(void)utf; +c = *ptr; +#endif /* SUPPORT_UNICODE */ +/* Looking backwards, it is LF that may be the second half of a CRLF pair. */ +if (type == NLTYPE_ANYCRLF) switch(c) + { + case CHAR_LF: + *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; + return TRUE; + + case CHAR_CR: + *lenptr = 1; + return TRUE; + + default: + return FALSE; + } + +/* NLTYPE_ANY */ + +else switch(c) + { + case CHAR_LF: + *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; + return TRUE; + +#ifdef EBCDIC + case CHAR_NEL: +#endif + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + *lenptr = 1; + return TRUE; + +#ifndef EBCDIC +#if PCRE2_CODE_UNIT_WIDTH == 8 + case CHAR_NEL: + *lenptr = utf?
2 : 1; /* NEL is two bytes in UTF-8 */ + return TRUE; + /* LS and PS each occupy three bytes in UTF-8. */ + case 0x2028: /* LS */ + case 0x2029: /* PS */ + *lenptr = 3; + return TRUE; + +#else /* 16-bit or 32-bit code units */ + case CHAR_NEL: + case 0x2028: /* LS */ + case 0x2029: /* PS */ + *lenptr = 1; + return TRUE; +#endif +#endif /* Not EBCDIC */ + + default: + return FALSE; + } +} + +/* End of pcre2_newline.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_ord2utf.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_ord2utf.c new file mode 100644 index 0000000000..f3581082ac --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_ord2utf.c @@ -0,0 +1,120 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This file contains a function that converts a Unicode character code point +into a UTF string. The behaviour is different for each code unit width. */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + +/* If SUPPORT_UNICODE is not defined, this function will never be called. +Supply a dummy function because some compilers do not like empty source +modules. 
*/ +/* Dummy version: ignores both arguments and returns 0. */ +#ifndef SUPPORT_UNICODE +unsigned int +PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) +{ +(void)(cvalue); +(void)(buffer); +return 0; +} +#else /* SUPPORT_UNICODE */ + + +/************************************************* +* Convert code point to UTF * +*************************************************/ + +/* +Arguments: + cvalue the character value + buffer pointer to buffer for result + +Returns: number of code units placed in the buffer +*/ + +unsigned int +PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) +{ +/* Convert to UTF-8 */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +int i, j; +for (i = 0; i < PRIV(utf8_table1_size); i++) + if ((int)cvalue <= PRIV(utf8_table1)[i]) break; /* i = extra bytes needed */ +buffer += i; /* Fill from the last byte backwards */ +for (j = i; j > 0; j--) + { + *buffer-- = 0x80 | (cvalue & 0x3f); /* Trailing byte: low six bits */ + cvalue >>= 6; + } +*buffer = PRIV(utf8_table2)[i] | cvalue; /* First byte */ +return i + 1; + +/* Convert to UTF-16 */ + +#elif PCRE2_CODE_UNIT_WIDTH == 16 +if (cvalue <= 0xffff) + { + *buffer = (PCRE2_UCHAR)cvalue; + return 1; + } +cvalue -= 0x10000; /* Above 0xffff: emit a surrogate pair */ +*buffer++ = 0xd800 | (cvalue >> 10); +*buffer = 0xdc00 | (cvalue & 0x3ff); +return 2; + +/* Convert to UTF-32 */ + +#else +*buffer = (PCRE2_UCHAR)cvalue; +return 1; +#endif +} +#endif /* SUPPORT_UNICODE */ + +/* End of pcre2_ord2utf.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_pattern_info.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_pattern_info.c new file mode 100644 index 0000000000..bc0e7f82bf --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_pattern_info.c @@ -0,0 +1,432 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language.
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + +/************************************************* +* Return info about compiled pattern * +*************************************************/ + +/* +Arguments: + code points to compiled code + what what information is required + where where to put the information; if NULL, return length + +Returns: 0 when data returned + > 0 when length requested + < 0 on error or unset value +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where) +{ +const pcre2_real_code *re = (pcre2_real_code *)code; +/* Length requests are answered without examining the compiled code. */ +if (where == NULL) /* Requests field length */ + { + switch(what) + { + case PCRE2_INFO_ALLOPTIONS: + case PCRE2_INFO_ARGOPTIONS: + case PCRE2_INFO_BACKREFMAX: + case PCRE2_INFO_BSR: + case PCRE2_INFO_CAPTURECOUNT: + case PCRE2_INFO_DEPTHLIMIT: + case PCRE2_INFO_EXTRAOPTIONS: + case PCRE2_INFO_FIRSTCODETYPE: + case PCRE2_INFO_FIRSTCODEUNIT: + case PCRE2_INFO_HASBACKSLASHC: + case PCRE2_INFO_HASCRORLF: + case PCRE2_INFO_HEAPLIMIT: + case PCRE2_INFO_JCHANGED: + case PCRE2_INFO_LASTCODETYPE: + case PCRE2_INFO_LASTCODEUNIT: + case PCRE2_INFO_MATCHEMPTY: + case PCRE2_INFO_MATCHLIMIT: + case PCRE2_INFO_MAXLOOKBEHIND: + case PCRE2_INFO_MINLENGTH: + case PCRE2_INFO_NAMEENTRYSIZE: + case PCRE2_INFO_NAMECOUNT: + case PCRE2_INFO_NEWLINE: + return sizeof(uint32_t); + + case PCRE2_INFO_FIRSTBITMAP: + return sizeof(const uint8_t *); + + case PCRE2_INFO_JITSIZE: + case PCRE2_INFO_SIZE: + case PCRE2_INFO_FRAMESIZE: + return sizeof(size_t); + + case PCRE2_INFO_NAMETABLE: + return sizeof(PCRE2_SPTR); + } + } +/* An unknown "what" is not rejected above; control simply continues here. */ +if (re == NULL) return PCRE2_ERROR_NULL; + +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE2_ERROR_BADMAGIC.
*/ + +if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; + +/* Check that this pattern was compiled in the correct bit mode */ + +if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE; + +switch(what) + { + case PCRE2_INFO_ALLOPTIONS: + *((uint32_t *)where) = re->overall_options; + break; + + case PCRE2_INFO_ARGOPTIONS: + *((uint32_t *)where) = re->compile_options; + break; + + case PCRE2_INFO_BACKREFMAX: + *((uint32_t *)where) = re->top_backref; + break; + + case PCRE2_INFO_BSR: + *((uint32_t *)where) = re->bsr_convention; + break; + + case PCRE2_INFO_CAPTURECOUNT: + *((uint32_t *)where) = re->top_bracket; + break; + /* UINT32_MAX marks an unset limit: it is stored, then UNSET is returned. */ + case PCRE2_INFO_DEPTHLIMIT: + *((uint32_t *)where) = re->limit_depth; + if (re->limit_depth == UINT32_MAX) return PCRE2_ERROR_UNSET; + break; + + case PCRE2_INFO_EXTRAOPTIONS: + *((uint32_t *)where) = re->extra_options; + break; + /* 1 if PCRE2_FIRSTSET is flagged, else 2 if PCRE2_STARTLINE is, else 0. */ + case PCRE2_INFO_FIRSTCODETYPE: + *((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)? 1 : + ((re->flags & PCRE2_STARTLINE) != 0)? 2 : 0; + break; + + case PCRE2_INFO_FIRSTCODEUNIT: + *((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)? + re->first_codeunit : 0; + break; + + case PCRE2_INFO_FIRSTBITMAP: + *((const uint8_t **)where) = ((re->flags & PCRE2_FIRSTMAPSET) != 0)?
+ &(re->start_bitmap[0]) : NULL; + break; + /* Fixed heapframe part plus two PCRE2_SIZE values per capture group. */ + case PCRE2_INFO_FRAMESIZE: + *((size_t *)where) = offsetof(heapframe, ovector) + + re->top_bracket * 2 * sizeof(PCRE2_SIZE); + break; + + case PCRE2_INFO_HASBACKSLASHC: + *((uint32_t *)where) = (re->flags & PCRE2_HASBKC) != 0; + break; + + case PCRE2_INFO_HASCRORLF: + *((uint32_t *)where) = (re->flags & PCRE2_HASCRORLF) != 0; + break; + /* UINT32_MAX marks an unset limit: it is stored, then UNSET is returned. */ + case PCRE2_INFO_HEAPLIMIT: + *((uint32_t *)where) = re->limit_heap; + if (re->limit_heap == UINT32_MAX) return PCRE2_ERROR_UNSET; + break; + + case PCRE2_INFO_JCHANGED: + *((uint32_t *)where) = (re->flags & PCRE2_JCHANGED) != 0; + break; + /* Without JIT support, or with no JIT code present, the size is zero. */ + case PCRE2_INFO_JITSIZE: +#ifdef SUPPORT_JIT + *((size_t *)where) = (re->executable_jit != NULL)? + PRIV(jit_get_size)(re->executable_jit) : 0; +#else + *((size_t *)where) = 0; +#endif + break; + + case PCRE2_INFO_LASTCODETYPE: + *((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)? 1 : 0; + break; + + case PCRE2_INFO_LASTCODEUNIT: + *((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)?
+ re->last_codeunit : 0; + break; + + case PCRE2_INFO_MATCHEMPTY: + *((uint32_t *)where) = (re->flags & PCRE2_MATCH_EMPTY) != 0; + break; + /* UINT32_MAX marks an unset limit: it is stored, then UNSET is returned. */ + case PCRE2_INFO_MATCHLIMIT: + *((uint32_t *)where) = re->limit_match; + if (re->limit_match == UINT32_MAX) return PCRE2_ERROR_UNSET; + break; + + case PCRE2_INFO_MAXLOOKBEHIND: + *((uint32_t *)where) = re->max_lookbehind; + break; + + case PCRE2_INFO_MINLENGTH: + *((uint32_t *)where) = re->minlength; + break; + + case PCRE2_INFO_NAMEENTRYSIZE: + *((uint32_t *)where) = re->name_entry_size; + break; + + case PCRE2_INFO_NAMECOUNT: + *((uint32_t *)where) = re->name_count; + break; + /* The name table starts immediately after the pcre2_real_code structure. */ + case PCRE2_INFO_NAMETABLE: + *((PCRE2_SPTR *)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code)); + break; + + case PCRE2_INFO_NEWLINE: + *((uint32_t *)where) = re->newline_convention; + break; + + case PCRE2_INFO_SIZE: + *((size_t *)where) = re->blocksize; + break; + + default: return PCRE2_ERROR_BADOPTION; + } + +return 0; +} + + + +/************************************************* +* Callout enumerator * +*************************************************/ + +/* +Arguments: + code points to compiled code + callback function called for each callout block + callout_data user data passed to the callback + +Returns: 0 when successfully completed + < 0 on local error + != 0 for callback error +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_callout_enumerate(const pcre2_code *code, + int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data) +{ +pcre2_real_code *re = (pcre2_real_code *)code; +pcre2_callout_enumerate_block cb; +PCRE2_SPTR cc; +#ifdef SUPPORT_UNICODE +BOOL utf; +#endif + +if (re == NULL) return PCRE2_ERROR_NULL; + +#ifdef SUPPORT_UNICODE +utf = (re->overall_options & PCRE2_UTF) != 0; +#endif + +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE2_ERROR_BADMAGIC.
*/ + +if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; + +/* Check that this pattern was compiled in the correct bit mode */ + +if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE; + +cb.version = 0; +cc = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + + re->name_count * re->name_entry_size; + +while (TRUE) + { + int rc; + switch (*cc) + { + case OP_END: + return 0; + + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_UPTO: + case OP_MINUPTO: + case OP_EXACT: + case OP_POSSTAR: + case OP_POSPLUS: + case OP_POSQUERY: + case OP_POSUPTO: + case OP_STARI: + case OP_MINSTARI: + case OP_PLUSI: + case OP_MINPLUSI: + case OP_QUERYI: + case OP_MINQUERYI: + case OP_UPTOI: + case OP_MINUPTOI: + case OP_EXACTI: + case OP_POSSTARI: + case OP_POSPLUSI: + case OP_POSQUERYI: + case OP_POSUPTOI: + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + case OP_NOTUPTO: + case OP_NOTMINUPTO: + case OP_NOTEXACT: + case OP_NOTPOSSTAR: + case OP_NOTPOSPLUS: + case OP_NOTPOSQUERY: + case OP_NOTPOSUPTO: + case OP_NOTSTARI: + case OP_NOTMINSTARI: + case OP_NOTPLUSI: + case OP_NOTMINPLUSI: + case OP_NOTQUERYI: + case OP_NOTMINQUERYI: + case OP_NOTUPTOI: + case OP_NOTMINUPTOI: + case OP_NOTEXACTI: + case OP_NOTPOSSTARI: + case OP_NOTPOSPLUSI: + case OP_NOTPOSQUERYI: + case OP_NOTPOSUPTOI: + cc += PRIV(OP_lengths)[*cc]; +#ifdef SUPPORT_UNICODE + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); +#endif + break; + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + case OP_TYPEPOSUPTO: + cc += PRIV(OP_lengths)[*cc]; +#ifdef 
SUPPORT_UNICODE + if (cc[-1] == OP_PROP || cc[-1] == OP_NOTPROP) cc += 2; +#endif + break; + +#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 + case OP_XCLASS: + cc += GET(cc, 1); + break; +#endif + + case OP_MARK: + case OP_COMMIT_ARG: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + cc += PRIV(OP_lengths)[*cc] + cc[1]; + break; + + case OP_CALLOUT: + cb.pattern_position = GET(cc, 1); + cb.next_item_length = GET(cc, 1 + LINK_SIZE); + cb.callout_number = cc[1 + 2*LINK_SIZE]; + cb.callout_string_offset = 0; + cb.callout_string_length = 0; + cb.callout_string = NULL; + rc = callback(&cb, callout_data); + if (rc != 0) return rc; + cc += PRIV(OP_lengths)[*cc]; + break; + + case OP_CALLOUT_STR: + cb.pattern_position = GET(cc, 1); + cb.next_item_length = GET(cc, 1 + LINK_SIZE); + cb.callout_number = 0; + cb.callout_string_offset = GET(cc, 1 + 3*LINK_SIZE); + cb.callout_string_length = + GET(cc, 1 + 2*LINK_SIZE) - (1 + 4*LINK_SIZE) - 2; + cb.callout_string = cc + (1 + 4*LINK_SIZE) + 1; + rc = callback(&cb, callout_data); + if (rc != 0) return rc; + cc += GET(cc, 1 + 2*LINK_SIZE); + break; + + default: + cc += PRIV(OP_lengths)[*cc]; + break; + } + } +} + +/* End of pcre2_pattern_info.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_script_run.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_script_run.c new file mode 100644 index 0000000000..795f09af5c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_script_run.c @@ -0,0 +1,344 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains the function for checking a script run. 
*/ + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + +/************************************************* +* Check script run * +*************************************************/ + +/* A script run is conceptually a sequence of characters all in the same +Unicode script. However, it isn't quite that simple. There are special rules +for scripts that are commonly used together, and also special rules for digits. +This function implements the appropriate checks, which is possible only when +PCRE2 is compiled with Unicode support. The function returns TRUE if there is +no Unicode support; however, it should never be called in that circumstance +because an error is given by pcre2_compile() if a script run is called for in a +version of PCRE2 compiled without Unicode support. + +Arguments: + pgr point to the first character + endptr point after the last character + utf TRUE if in UTF mode + +Returns: TRUE if this is a valid script run +*/ + +/* These are states in the checking process. */ + +enum { SCRIPT_UNSET, /* Requirement as yet unknown */ + SCRIPT_MAP, /* Bitmap contains acceptable scripts */ + SCRIPT_HANPENDING, /* Have had only Han characters */ + SCRIPT_HANHIRAKATA, /* Expect Han or Hirikata */ + SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */ + SCRIPT_HANHANGUL /* Expect Han or Hangul */ + }; + +#define UCD_MAPSIZE (ucp_Unknown/32 + 1) +#define FULL_MAPSIZE (ucp_Script_Count/32 + 1) + +BOOL +PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) +{ +#ifdef SUPPORT_UNICODE +uint32_t require_state = SCRIPT_UNSET; +uint32_t require_map[FULL_MAPSIZE]; +uint32_t map[FULL_MAPSIZE]; +uint32_t require_digitset = 0; +uint32_t c; + +#if PCRE2_CODE_UNIT_WIDTH == 32 +(void)utf; /* Avoid compiler warning */ +#endif + +/* Any string containing fewer than 2 characters is a valid script run. 
*/ + +if (ptr >= endptr) return TRUE; +GETCHARINCTEST(c, ptr); +if (ptr >= endptr) return TRUE; + +/* Initialize the require map. This is a full-size bitmap that has a bit for +every script, as opposed to the maps in ucd_script_sets, which only have bits +for scripts less than ucp_Unknown - those that appear in script extension +lists. */ + +for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; + +/* Scan strings of two or more characters, checking the Unicode characteristics +of each code point. There is special code for scripts that can be combined with +characters from the Han Chinese script. This may be used in conjunction with +four other scripts in these combinations: + +. Han with Hiragana and Katakana is allowed (for Japanese). +. Han with Bopomofo is allowed (for Taiwanese Mandarin). +. Han with Hangul is allowed (for Korean). + +If the first significant character's script is one of the four, the required +script type is immediately known. However, if the first significant +character's script is Han, we have to keep checking for a non-Han character. +Hence the SCRIPT_HANPENDING state. */ + +for (;;) + { + const ucd_record *ucd = GET_UCD(c); + uint32_t script = ucd->script; + + /* If the script is Unknown, the string is not a valid script run. Such + characters can only form script runs of length one (see test above). */ + + if (script == ucp_Unknown) return FALSE; + + /* A character without any script extensions whose script is Inherited or + Common is always accepted with any script. If there are extensions, the + following processing happens for all scripts. */ + + if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common)) + { + BOOL OK; + + /* Set up a full-sized map for this character that can include bits for all + scripts. 
Copy the scriptx map for this character (which covers those + scripts that appear in script extension lists), set the remaining values to + zero, and then, except for Common or Inherited, add this script's bit to + the map. */ + + memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); + memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); + if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); + + /* Handle the different checking states */ + + switch(require_state) + { + /* First significant character - it might follow Common or Inherited + characters that do not have any script extensions. */ + + case SCRIPT_UNSET: + switch(script) + { + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; + + default: + memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); + require_state = SCRIPT_MAP; + break; + } + break; + + /* The first significant character was Han. An inspection of the Unicode + 11.0.0 files shows that there are the following types of Script Extension + list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul + scripts: + + . Bopomofo + Han + . Han + Hiragana + Katakana + . Hiragana + Katakana + . Bopopmofo + Hangul + Han + Hiragana + Katakana + + The following code tries to make sense of this. 
*/ + +#define FOUND_BOPOMOFO 1 +#define FOUND_HIRAGANA 2 +#define FOUND_KATAKANA 4 +#define FOUND_HANGUL 8 + + case SCRIPT_HANPENDING: + if (script != ucp_Han) /* Another Han does nothing */ + { + uint32_t chspecial = 0; + + if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; + if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; + if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; + if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; + + if (chspecial == 0) return FALSE; /* Not allowed with Han */ + + if (chspecial == FOUND_BOPOMOFO) + require_state = SCRIPT_HANBOPOMOFO; + else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) + require_state = SCRIPT_HANHIRAKATA; + + /* Otherwise this character must be allowed with all of them, so remain + in the pending state. */ + } + break; + + /* Previously encountered one of the "with Han" scripts. Check that + this character is appropriate. */ + + case SCRIPT_HANHIRAKATA: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + + MAPBIT(map, ucp_Katakana) == 0) return FALSE; + break; + + case SCRIPT_HANBOPOMOFO: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; + break; + + case SCRIPT_HANHANGUL: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; + break; + + /* Previously encountered one or more characters that are allowed with a + list of scripts. */ + + case SCRIPT_MAP: + OK = FALSE; + + for (int i = 0; i < FULL_MAPSIZE; i++) + { + if ((require_map[i] & map[i]) != 0) + { + OK = TRUE; + break; + } + } + + if (!OK) return FALSE; + + /* The rest of the string must be in this script, but we have to + allow for the Han complications. 
*/ + + switch(script) + { + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; + + /* Compute the intersection of the required list of scripts and the + allowed scripts for this character. */ + + default: + for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; + break; + } + + break; + } + } /* End checking character's script and extensions. */ + + /* The character is in an acceptable script. We must now ensure that all + decimal digits in the string come from the same set. Some scripts (e.g. + Common, Arabic) have more than one set of decimal digits. This code does + not allow mixing sets, even within the same script. The vector called + PRIV(ucd_digit_sets)[] contains, in its first element, the number of + following elements, and then, in ascending order, the code points of the + '9' characters in every set of 10 digits. Each set is identified by the + offset in the vector of its '9' character. An initial check of the first + value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + + if (ucd->chartype == ucp_Nd) + { + uint32_t digitset; + + if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + { + int mid; + int bot = 1; + int top = PRIV(ucd_digit_sets)[0]; + for (;;) + { + if (top <= bot + 1) /* <= rather than == is paranoia */ + { + digitset = top; + break; + } + mid = (top + bot) / 2; + if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + } + } + + /* A required value of 0 means "unset". */ + + if (require_digitset == 0) require_digitset = digitset; + else if (digitset != require_digitset) return FALSE; + } /* End digit handling */ + + /* If we haven't yet got to the end, pick up the next character. 
*/ + + if (ptr >= endptr) return TRUE; + GETCHARINCTEST(c, ptr); + } /* End checking loop */ + +#else /* NOT SUPPORT_UNICODE */ +(void)ptr; +(void)endptr; +(void)utf; +return TRUE; +#endif /* SUPPORT_UNICODE */ +} + +/* End of pcre2_script_run.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_serialize.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_serialize.c new file mode 100644 index 0000000000..2f79d9ee17 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_serialize.c @@ -0,0 +1,286 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains functions for serializing and deserializing +a sequence of compiled codes. */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + + +#include "regexp/pcre2/pcre2_internal.h" + +/* Magic number to provide a small check against being handed junk. */ + +#define SERIALIZED_DATA_MAGIC 0x50523253u + +/* Deserialization is limited to the current PCRE version and +character width. 
*/ + +#define SERIALIZED_DATA_VERSION \ + ((PCRE2_MAJOR) | ((PCRE2_MINOR) << 16)) + +#define SERIALIZED_DATA_CONFIG \ + (sizeof(PCRE2_UCHAR) | ((sizeof(void*)) << 8) | ((sizeof(PCRE2_SIZE)) << 16)) + + + +/************************************************* +* Serialize compiled patterns * +*************************************************/ + +PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION +pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes, + uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, + pcre2_general_context *gcontext) +{ +uint8_t *bytes; +uint8_t *dst_bytes; +int32_t i; +PCRE2_SIZE total_size; +const pcre2_real_code *re; +const uint8_t *tables; +pcre2_serialized_data *data; + +const pcre2_memctl *memctl = (gcontext != NULL) ? + &gcontext->memctl : &PRIV(default_compile_context).memctl; + +if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL) + return PCRE2_ERROR_NULL; + +if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; + +/* Compute total size. */ +total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH; +tables = NULL; + +for (i = 0; i < number_of_codes; i++) + { + if (codes[i] == NULL) return PCRE2_ERROR_NULL; + re = (const pcre2_real_code *)(codes[i]); + if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; + if (tables == NULL) + tables = re->tables; + else if (tables != re->tables) + return PCRE2_ERROR_MIXEDTABLES; + total_size += re->blocksize; + } + +/* Initialize the byte stream. */ +bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data); +if (bytes == NULL) return PCRE2_ERROR_NOMEMORY; + +/* The controller is stored as a hidden parameter. */ +memcpy(bytes, memctl, sizeof(pcre2_memctl)); +bytes += sizeof(pcre2_memctl); + +data = (pcre2_serialized_data *)bytes; +data->magic = SERIALIZED_DATA_MAGIC; +data->version = SERIALIZED_DATA_VERSION; +data->config = SERIALIZED_DATA_CONFIG; +data->number_of_codes = number_of_codes; + +/* Copy all compiled code data. 
*/ +dst_bytes = bytes + sizeof(pcre2_serialized_data); +memcpy(dst_bytes, tables, TABLES_LENGTH); +dst_bytes += TABLES_LENGTH; + +for (i = 0; i < number_of_codes; i++) + { + re = (const pcre2_real_code *)(codes[i]); + (void)memcpy(dst_bytes, (char *)re, re->blocksize); + + /* Certain fields in the compiled code block are re-set during + deserialization. In order to ensure that the serialized data stream is always + the same for the same pattern, set them to zero here. We can't assume the + copy of the pattern is correctly aligned for accessing the fields as part of + a structure. Note the use of sizeof(void *) in the second of these, to + specify the size of a pointer. If sizeof(uint8_t *) is used (tables is a + pointer to uint8_t), gcc gives a warning because the first argument is also a + pointer to uint8_t. Casting the first argument to (void *) can stop this, but + it didn't stop Coverity giving the same complaint. */ + + (void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0, + sizeof(pcre2_memctl)); + (void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0, + sizeof(void *)); + (void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0, + sizeof(void *)); + + dst_bytes += re->blocksize; + } + +*serialized_bytes = bytes; +*serialized_size = total_size; +return number_of_codes; +} + + +/************************************************* +* Deserialize compiled patterns * +*************************************************/ + +PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION +pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes, + const uint8_t *bytes, pcre2_general_context *gcontext) +{ +const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; +const pcre2_memctl *memctl = (gcontext != NULL) ? + &gcontext->memctl : &PRIV(default_compile_context).memctl; + +const uint8_t *src_bytes; +pcre2_real_code *dst_re; +uint8_t *tables; +int32_t i, j; + +/* Sanity checks. 
*/ + +if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL; +if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; +if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA; +if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; +if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; +if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; + +if (number_of_codes > data->number_of_codes) + number_of_codes = data->number_of_codes; + +src_bytes = bytes + sizeof(pcre2_serialized_data); + +/* Decode tables. The reference count for the tables is stored immediately +following them. */ + +tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data); +if (tables == NULL) return PCRE2_ERROR_NOMEMORY; + +memcpy(tables, src_bytes, TABLES_LENGTH); +*(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes; +src_bytes += TABLES_LENGTH; + +/* Decode the byte stream. We must not try to read the size from the compiled +code block in the stream, because it might be unaligned, which causes errors on +hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type +of the blocksize field is given its own name to ensure that it is the same here +as in the block. */ + +for (i = 0; i < number_of_codes; i++) + { + CODE_BLOCKSIZE_TYPE blocksize; + memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize), + sizeof(CODE_BLOCKSIZE_TYPE)); + if (blocksize <= sizeof(pcre2_real_code)) + return PCRE2_ERROR_BADSERIALIZEDDATA; + + /* The allocator provided by gcontext replaces the original one. */ + + dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize, + (pcre2_memctl *)gcontext); + if (dst_re == NULL) + { + memctl->free(tables, memctl->memory_data); + for (j = 0; j < i; j++) + { + memctl->free(codes[j], memctl->memory_data); + codes[j] = NULL; + } + return PCRE2_ERROR_NOMEMORY; + } + + /* The new allocator must be preserved. 
*/ + + memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl), + src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl)); + if (dst_re->magic_number != MAGIC_NUMBER || + dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 || + dst_re->name_count > MAX_NAME_COUNT) + { + memctl->free(dst_re, memctl->memory_data); + return PCRE2_ERROR_BADSERIALIZEDDATA; + } + + /* At the moment only one table is supported. */ + + dst_re->tables = tables; + dst_re->executable_jit = NULL; + dst_re->flags |= PCRE2_DEREF_TABLES; + + codes[i] = dst_re; + src_bytes += blocksize; + } + +return number_of_codes; +} + + +/************************************************* +* Get the number of serialized patterns * +*************************************************/ + +PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION +pcre2_serialize_get_number_of_codes(const uint8_t *bytes) +{ +const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes; + +if (data == NULL) return PCRE2_ERROR_NULL; +if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; +if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; +if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; + +return data->number_of_codes; +} + + +/************************************************* +* Free the allocated stream * +*************************************************/ + +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_serialize_free(uint8_t *bytes) +{ +if (bytes != NULL) + { + pcre2_memctl *memctl = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl)); + memctl->free(memctl, memctl->memory_data); + } +} + +/* End of pcre2_serialize.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_string_utils.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_string_utils.c new file mode 100644 index 0000000000..8257cc6d59 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_string_utils.c @@ -0,0 +1,237 @@ +/************************************************* +* 
Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2018-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + +/* This module contains internal functions for comparing and finding the length +of strings. These are used instead of strcmp() etc because the standard +functions work only on 8-bit data. */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + +/************************************************* +* Emulated memmove() for systems without it * +*************************************************/ + +/* This function can make use of bcopy() if it is available. Otherwise do it by +steam, as there some non-Unix environments that lack both memmove() and +bcopy(). */ + +#if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) +void * +PRIV(memmove)(void *d, const void *s, size_t n) +{ +#ifdef HAVE_BCOPY +bcopy(s, d, n); +return d; +#else +size_t i; +unsigned char *dest = (unsigned char *)d; +const unsigned char *src = (const unsigned char *)s; +if (dest > src) + { + dest += n; + src += n; + for (i = 0; i < n; ++i) *(--dest) = *(--src); + return (void *)dest; + } +else + { + for (i = 0; i < n; ++i) *dest++ = *src++; + return (void *)(dest - n); + } +#endif /* not HAVE_BCOPY */ +} +#endif /* not VPCOMPAT && not HAVE_MEMMOVE */ + + +/************************************************* +* Compare two zero-terminated PCRE2 strings * +*************************************************/ + +/* +Arguments: + str1 first string + str2 second string + +Returns: 0, 1, or -1 +*/ + +int +PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2) +{ +PCRE2_UCHAR c1, c2; +while (*str1 != '\0' || *str2 != '\0') + { + c1 = *str1++; + c2 = *str2++; + if (c1 != c2) return ((c1 > c2) << 1) - 1; + } +return 0; +} + + +/************************************************* +* Compare zero-terminated PCRE2 & 8-bit strings * +*************************************************/ + +/* As the 8-bit string is almost always a literal, its type is specified as +const char *. 
+ +Arguments: + str1 first string + str2 second string + +Returns: 0, 1, or -1 +*/ + +int +PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2) +{ +PCRE2_UCHAR c1, c2; +while (*str1 != '\0' || *str2 != '\0') + { + c1 = *str1++; + c2 = *str2++; + if (c1 != c2) return ((c1 > c2) << 1) - 1; + } +return 0; +} + + +/************************************************* +* Compare two PCRE2 strings, given a length * +*************************************************/ + +/* +Arguments: + str1 first string + str2 second string + len the length + +Returns: 0, 1, or -1 +*/ + +int +PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len) +{ +PCRE2_UCHAR c1, c2; +for (; len > 0; len--) + { + c1 = *str1++; + c2 = *str2++; + if (c1 != c2) return ((c1 > c2) << 1) - 1; + } +return 0; +} + + +/************************************************* +* Compare PCRE2 string to 8-bit string by length * +*************************************************/ + +/* As the 8-bit string is almost always a literal, its type is specified as +const char *. 
+ +Arguments: + str1 first string + str2 second string + len the length + +Returns: 0, 1, or -1 +*/ + +int +PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len) +{ +PCRE2_UCHAR c1, c2; +for (; len > 0; len--) + { + c1 = *str1++; + c2 = *str2++; + if (c1 != c2) return ((c1 > c2) << 1) - 1; + } +return 0; +} + + +/************************************************* +* Find the length of a PCRE2 string * +*************************************************/ + +/* +Argument: the string +Returns: the length +*/ + +PCRE2_SIZE +PRIV(strlen)(PCRE2_SPTR str) +{ +PCRE2_SIZE c = 0; +while (*str++ != 0) c++; +return c; +} + + +/************************************************* +* Copy 8-bit 0-terminated string to PCRE2 string * +*************************************************/ + +/* Arguments: + str1 buffer to receive the string + str2 8-bit string to be copied + +Returns: the number of code units used (excluding trailing zero) +*/ + +PCRE2_SIZE +PRIV(strcpy_c8)(PCRE2_UCHAR *str1, const char *str2) +{ +PCRE2_UCHAR *t = str1; +while (*str2 != 0) *t++ = *str2++; +*t = 0; +return t - str1; +} + +/* End of pcre2_string_utils.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_study.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_study.c new file mode 100644 index 0000000000..bd77ac9021 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_study.c @@ -0,0 +1,1825 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains functions for scanning a compiled pattern and +collecting data (e.g. minimum matching length). 
*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + +/* The maximum remembered capturing brackets minimum. */ + +#define MAX_CACHE_BACKREF 128 + +/* Set a bit in the starting code unit bit map. */ + +#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7)) + +/* Returns from set_start_bits() */ + +enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP }; + + +/************************************************* +* Find the minimum subject length for a group * +*************************************************/ + +/* Scan a parenthesized group and compute the minimum length of subject that +is needed to match it. This is a lower bound; it does not mean there is a +string of that length that matches. In UTF mode, the result is in characters +rather than code units. The field in a compiled pattern for storing the minimum +length is 16-bits long (on the grounds that anything longer than that is +pathological), so we give up when we reach that amount. This also means that +integer overflow for really crazy patterns cannot happen. + +Backreference minimum lengths are cached to speed up multiple references. This +function is called only when the highest back reference in the pattern is less +than or equal to MAX_CACHE_BACKREF, which is one less than the size of the +caching vector. The zeroth element contains the number of the highest set +value. + +Arguments: + re compiled pattern block + code pointer to start of group (the bracket) + startcode pointer to start of the whole pattern's code + utf UTF flag + recurses chain of recurse_check to catch mutual recursion + countptr pointer to call count (to catch over complexity) + backref_cache vector for caching back references. + +This function is no longer called when the pattern contains (*ACCEPT); however, +the old code for returning -1 is retained, just in case. 
+ +Returns: the minimum length + -1 \C in UTF-8 mode + or (*ACCEPT) + or pattern too complicated + -2 internal error (missing capturing bracket) + -3 internal error (opcode not listed) +*/ + +static int +find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, + PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr, + int *backref_cache) +{ +int length = -1; +int branchlength = 0; +int prev_cap_recno = -1; +int prev_cap_d = 0; +int prev_recurse_recno = -1; +int prev_recurse_d = 0; +uint32_t once_fudge = 0; +BOOL had_recurse = FALSE; +BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; +PCRE2_SPTR nextbranch = code + GET(code, 1); +PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; +recurse_check this_recurse; + +/* If this is a "could be empty" group, its minimum length is 0. */ + +if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; + +/* Skip over capturing bracket number */ + +if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; + +/* A large and/or complex regex can take too long to process. */ + +if ((*countptr)++ > 1000) return -1; + +/* Scan along the opcodes for this branch. If we get to the end of the branch, +check the length against that of the other branches. If the accumulated length +passes 16-bits, reset to that value and skip the rest of the branch. */ + +for (;;) + { + int d, min, recno; + PCRE2_UCHAR op, *cs, *ce; + + if (branchlength >= UINT16_MAX) + { + branchlength = UINT16_MAX; + cc = (PCRE2_UCHAR *)nextbranch; + } + + op = *cc; + switch (op) + { + case OP_COND: + case OP_SCOND: + + /* If there is only one branch in a condition, the implied branch has zero + length, so we don't add anything. This covers the DEFINE "condition" + automatically. If there are two branches we can treat it the same as any + other non-capturing subpattern.
*/ + + cs = cc + GET(cc, 1); + if (*cs != OP_ALT) + { + cc = cs + 1 + LINK_SIZE; + break; + } + goto PROCESS_NON_CAPTURE; + + case OP_BRA: + /* There's a special case of OP_BRA, when it is wrapped round a repeated + OP_RECURSE. We'd like to process the latter at this level so that + remembering the value works for repeated cases. So we do nothing, but + set a fudge value to skip over the OP_KET after the recurse. */ + + if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) + { + once_fudge = 1 + LINK_SIZE; + cc += 1 + LINK_SIZE; + break; + } + /* Fall through */ + + case OP_ONCE: + case OP_SCRIPT_RUN: + case OP_SBRA: + case OP_BRAPOS: + case OP_SBRAPOS: + PROCESS_NON_CAPTURE: + d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* To save time for repeated capturing subpatterns, we remember the + length of the previous one. Unfortunately we can't do the same for + the unnumbered ones above. Nor can we do this if (?| is present in the + pattern because captures with the same number are not then identical. */ + + case OP_CBRA: + case OP_SCBRA: + case OP_CBRAPOS: + case OP_SCBRAPOS: + recno = (int)GET2(cc, 1+LINK_SIZE); + if (dupcapused || recno != prev_cap_recno) + { + prev_cap_recno = recno; + prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); + if (prev_cap_d < 0) return prev_cap_d; + } + branchlength += prev_cap_d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* ACCEPT makes things far too complicated; we have to give up. In fact, + from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not + used. However, leave the code in place, just in case. */ + + case OP_ACCEPT: + case OP_ASSERT_ACCEPT: + return -1; + + /* Reached end of a branch; if it's a ket it is the end of a nested + call.
If it's ALT it is an alternation in a nested call. If it is END it's + the end of the outer call. All can be handled by the same code. If the + length of any branch is zero, there is no need to scan any subsequent + branches. */ + + case OP_ALT: + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_KETRPOS: + case OP_END: + if (length < 0 || (!had_recurse && branchlength < length)) + length = branchlength; + if (op != OP_ALT || length == 0) return length; + nextbranch = cc + GET(cc, 1); + cc += 1 + LINK_SIZE; + branchlength = 0; + had_recurse = FALSE; + break; + + /* Skip over assertive subpatterns */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: + do cc += GET(cc, 1); while (*cc == OP_ALT); + /* Fall through */ + + /* Skip over things that don't match chars */ + + case OP_REVERSE: + case OP_CREF: + case OP_DNCREF: + case OP_RREF: + case OP_DNRREF: + case OP_FALSE: + case OP_TRUE: + case OP_CALLOUT: + case OP_SOD: + case OP_SOM: + case OP_EOD: + case OP_EODN: + case OP_CIRC: + case OP_CIRCM: + case OP_DOLL: + case OP_DOLLM: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + cc += PRIV(OP_lengths)[*cc]; + break; + + case OP_CALLOUT_STR: + cc += GET(cc, 1 + 2*LINK_SIZE); + break; + + /* Skip over a subpattern that has a {0} or {0,x} quantifier */ + + case OP_BRAZERO: + case OP_BRAMINZERO: + case OP_BRAPOSZERO: + case OP_SKIPZERO: + cc += PRIV(OP_lengths)[*cc]; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Handle literal characters and + repetitions */ + + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_PLUS: + case OP_PLUSI: + case OP_MINPLUS: + case OP_MINPLUSI: + case OP_POSPLUS: + case OP_POSPLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + branchlength++; + cc += 2; +#ifdef SUPPORT_UNICODE + if (utf &&
HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); +#endif + break; + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + branchlength++; + cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; + break; + + /* Handle exact repetitions. The count is already in characters, but we + may need to skip over a multibyte character in UTF mode. */ + + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + branchlength += GET2(cc,1); + cc += 2 + IMM2_SIZE; +#ifdef SUPPORT_UNICODE + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); +#endif + break; + + case OP_TYPEEXACT: + branchlength += GET2(cc,1); + cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP + || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); + break; + + /* Handle single-char non-literal matchers */ + + case OP_PROP: + case OP_NOTPROP: + cc += 2; + /* Fall through */ + + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case OP_ALLANY: + case OP_EXTUNI: + case OP_HSPACE: + case OP_NOT_HSPACE: + case OP_VSPACE: + case OP_NOT_VSPACE: + branchlength++; + cc++; + break; + + /* "Any newline" might match two characters, but it also might match just + one. */ + + case OP_ANYNL: + branchlength += 1; + cc++; + break; + + /* The single-byte matcher means we can't proceed in UTF mode. (In + non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever + appear, but leave the code, just in case.) */ + + case OP_ANYBYTE: +#ifdef SUPPORT_UNICODE + if (utf) return -1; +#endif + branchlength++; + cc++; + break; + + /* For repeated character types, we have to test for \p and \P, which have + an extra two bytes of parameters.
*/ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSQUERY: + if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; + cc += PRIV(OP_lengths)[op]; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + if (cc[1 + IMM2_SIZE] == OP_PROP + || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; + cc += PRIV(OP_lengths)[op]; + break; + + /* Check a class for variable quantification */ + + case OP_CLASS: + case OP_NCLASS: +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + /* The original code caused an unsigned overflow in 64 bit systems, + so now we use a conditional statement. */ + if (op == OP_XCLASS) + cc += GET(cc, 1); + else + cc += PRIV(OP_lengths)[OP_CLASS]; +#else + cc += PRIV(OP_lengths)[OP_CLASS]; +#endif + + switch (*cc) + { + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + branchlength++; + /* Fall through */ + + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + branchlength += GET2(cc,1); + cc += 1 + 2 * IMM2_SIZE; + break; + + default: + branchlength++; + break; + } + break; + + /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same + way: we find the minimum length for the subpattern. A recursion + (backreference or subroutine) causes a flag to be set that causes the + length of this branch to be ignored. The logic is that a recursion can only + make sense if there is another alternative that stops the recursing. That + will provide the minimum length (when no recursion happens). + + If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket + matches an empty string (by default it causes a matching failure), so in + that case we must set the minimum length to zero.
+ + For backreferences, if duplicate numbers are present in the pattern we check + for a reference to a duplicate. If it is, we don't know which version will + be referenced, so we have to set the minimum length to zero. */ + + /* Duplicate named pattern back reference. */ + + case OP_DNREF: + case OP_DNREFI: + if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + { + int count = GET2(cc, 1+IMM2_SIZE); + PCRE2_UCHAR *slot = + (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + + GET2(cc, 1) * re->name_entry_size; + + d = INT_MAX; + + /* Scan all groups with the same name; find the shortest. */ + + while (count-- > 0) + { + int dd, i; + recno = GET2(slot, 0); + + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + dd = backref_cache[recno]; + else + { + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + + dd = 0; + if (!dupcapused || + (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) + { + if (cc > cs && cc < ce) /* Simple recursion */ + { + had_recurse = TRUE; + } + else + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) + if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + had_recurse = TRUE; + } + else + { + this_recurse.prev = recurses; /* No recursion */ + this_recurse.group = cs; + dd = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (dd < 0) return dd; + } + } + } + + backref_cache[recno] = dd; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; + } + + if (dd < d) d = dd; + if (d <= 0) break; /* No point looking at any more */ + slot += re->name_entry_size; + } + } + else d = 0; + cc += 1 + 2*IMM2_SIZE; + goto REPEAT_BACK_REFERENCE; + + /* Single back reference by number. References by name are converted to by + number when there is no duplication.
*/ + + case OP_REF: + case OP_REFI: + recno = GET2(cc, 1); + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + d = backref_cache[recno]; + else + { + int i; + d = 0; + + if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + { + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + + if (!dupcapused || + (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL) + { + if (cc > cs && cc < ce) /* Simple recursion */ + { + had_recurse = TRUE; + } + else + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + had_recurse = TRUE; + } + else /* No recursion */ + { + this_recurse.prev = recurses; + this_recurse.group = cs; + d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, + backref_cache); + if (d < 0) return d; + } + } + } + } + + backref_cache[recno] = d; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; + } + + cc += 1 + IMM2_SIZE; + + /* Handle repeated back references */ + + REPEAT_BACK_REFERENCE: + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: + min = 0; + cc++; + break; + + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + min = 1; + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + min = GET2(cc, 1); + cc += 1 + 2 * IMM2_SIZE; + break; + + default: + min = 1; + break; + } + + /* Take care not to overflow: (1) min and d are ints, so check that their + product is not greater than INT_MAX. (2) branchlength is limited to + UINT16_MAX (checked at the top of the loop).
*/ + + if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d) + branchlength = UINT16_MAX; + else branchlength += min * d; + break; + + /* Recursion always refers to the first occurrence of a subpattern with a + given number. Therefore, we can always make use of caching, even when the + pattern contains multiple subpatterns with the same number. */ + + case OP_RECURSE: + cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); + recno = GET2(cs, 1+LINK_SIZE); + if (recno == prev_recurse_recno) + { + branchlength += prev_recurse_d; + } + else + { + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ + had_recurse = TRUE; + else + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + had_recurse = TRUE; + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (prev_recurse_d < 0) return prev_recurse_d; + prev_recurse_recno = recno; + branchlength += prev_recurse_d; + } + } + } + cc += 1 + LINK_SIZE + once_fudge; + once_fudge = 0; + break; + + /* Anything else does not or need not match a character. We can get the + item's length from the table, but for those that can match zero occurrences + of a character, we must take special action for UTF-8 characters. As it + happens, the "NOT" versions of these opcodes are used at present only for + ASCII characters, so they could be omitted from this list. However, in + future that may change, so we include them here so as not to leave a + gotcha for a future maintainer.
*/ + + case OP_UPTO: + case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_MINUPTO: + case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_POSUPTO: + case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + + case OP_STAR: + case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_MINSTAR: + case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_POSSTAR: + case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + + case OP_QUERY: + case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_MINQUERY: + case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_POSQUERY: + case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + + cc += PRIV(OP_lengths)[op]; +#ifdef SUPPORT_UNICODE + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); +#endif + break; + + /* Skip these, but we need to add in the name length. */ + + case OP_MARK: + case OP_COMMIT_ARG: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + cc += PRIV(OP_lengths)[op] + cc[1]; + break; + + /* The remaining opcodes are just skipped over. */ + + case OP_CLOSE: + case OP_COMMIT: + case OP_FAIL: + case OP_PRUNE: + case OP_SET_SOM: + case OP_SKIP: + case OP_THEN: + cc += PRIV(OP_lengths)[op]; + break; + + /* This should not occur: we list all opcodes explicitly so that when + new ones get added they are properly considered. */ + + default: + return -3; + } + } +/* Control never gets here */ +} + + + +/************************************************* +* Set a bit and maybe its alternate case * +*************************************************/ + +/* Given a character, set its first code unit's bit in the table, and also the +corresponding bit for the other version of a letter if we are caseless.
+ +Arguments: + re points to the regex block + p points to the first code unit of the character + caseless TRUE if caseless + utf TRUE for UTF mode + ucp TRUE for UCP mode + +Returns: pointer after the character +*/ + +static PCRE2_SPTR +set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, + BOOL ucp) +{ +uint32_t c = *p++; /* First code unit */ + +(void)utf; /* Stop compiler warnings: utf and ucp are read only when SUPPORT_UNICODE is defined */ +(void)ucp; + +/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for +0xff. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 +if (c > 0xff) SET_BIT(0xff); else +#endif + +SET_BIT(c); + +/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find +the end of the character, even when caseless. */ + +#ifdef SUPPORT_UNICODE +if (utf) + { +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (c >= 0xc0) GETUTF8INC(c, p); +#elif PCRE2_CODE_UNIT_WIDTH == 16 + if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p); +#endif + } +#endif /* SUPPORT_UNICODE */ + +/* If caseless, handle the other case of the character. */ + +if (caseless) + { +#ifdef SUPPORT_UNICODE + if (utf || ucp) + { + c = UCD_OTHERCASE(c); +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) + { + PCRE2_UCHAR buff[6]; + (void)PRIV(ord2utf)(c, buff); + SET_BIT(buff[0]); /* Only the first code unit of the other case is needed */ + } + else if (c < 256) SET_BIT(c); +#else /* 16-bit or 32-bit mode */ + if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); +#endif + } + + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF or UCP: the other case comes from tables[fcc_offset + c] */ + + if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); + } + +return p; +} + + + +/************************************************* +* Set bits for a positive character type * +*************************************************/ + +/* This function sets starting bits for a character type. In UTF-8 mode, we can +only do a direct setting for bytes less than 128, as otherwise there can be +confusion with bytes in the middle of UTF-8 characters.
In a "traditional" +environment, the tables will only recognize ASCII characters anyway, but in at +least one Windows environment, some higher bytes bits were set in the tables. +So we deal with that case by considering the UTF-8 encoding. + +Arguments: + re the regex block + cbit_type the type of character wanted (offset of its class bitmap in the cbits tables) + table_limit 32 for non-UTF-8; 16 for UTF-8 + +Returns: nothing +*/ + +static void +set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) +{ +uint32_t c; +for (c = 0; c < table_limit; c++) + re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type]; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 +if (table_limit == 32) return; /* Not UTF-8: the loop above covered all 256 values */ +for (c = 128; c < 256; c++) + { + if ((re->tables[cbits_offset + cbit_type + c/8] & (1u << (c&7))) != 0) /* Test bit c of the cbit_type class bitmap, matching the loop above */ + { + PCRE2_UCHAR buff[6]; + (void)PRIV(ord2utf)(c, buff); + SET_BIT(buff[0]); + } + } +#endif /* SUPPORT_UNICODE && 8-bit */ +} + + +/************************************************* +* Set bits for a negative character type * +*************************************************/ + +/* This function sets starting bits for a negative character type such as \D. +In UTF-8 mode, we can only do a direct setting for bytes less than 128, as +otherwise there can be confusion with bytes in the middle of UTF-8 characters. +Unlike in the positive case, where we can set appropriate starting bits for +specific high-valued UTF-8 characters, in this case we have to set the bits for +all high-valued characters. The lowest is 0xc2, but we overkill by starting at +0xc0 (192) for simplicity.
+ +Arguments: + re the regex block + cbit type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + +Returns: nothing +*/ + +static void +set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) +{ +uint32_t c; +for (c = 0; c < table_limit; c++) + re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type])); +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 +if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff; /* Bitmap bytes 24-31 hold the bits for code units 0xc0-0xff */ +#endif /* SUPPORT_UNICODE && 8-bit */ +} + + + +/************************************************* +* Create bitmap of starting code units * +*************************************************/ + +/* This function scans a compiled unanchored expression recursively and +attempts to build a bitmap of the set of possible starting code units whose +values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause +the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode +we pass a value of 16 rather than 32 as the final argument. (See comments in +those functions for the reason.) + +The SSB_CONTINUE return is useful for parenthesized groups in patterns such as +(a*)b where the group provides some optional starting code units but scanning +must continue at the outer level to find at least one mandatory code unit. At +the outermost level, this function fails unless the result is SSB_DONE. + +We restrict recursion (for nested groups) to 1000 to avoid stack overflow +issues.
+ +Arguments: + re points to the compiled regex block + code points to an expression + utf TRUE if in UTF mode + ucp TRUE if in UCP mode + depthptr pointer to recurse depth + +Returns: SSB_FAIL => Failed to find any starting code units + SSB_DONE => Found mandatory starting code units + SSB_CONTINUE => Found optional starting code units + SSB_UNKNOWN => Hit an unrecognized opcode + SSB_TOODEEP => Recursion is too deep +*/ + +static int +set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp, + int *depthptr) +{ +uint32_t c; +int yield = SSB_DONE; + +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 +int table_limit = utf? 16:32; +#else +int table_limit = 32; +#endif + +*depthptr += 1; +if (*depthptr > 1000) return SSB_TOODEEP; + +do + { + BOOL try_next = TRUE; + PCRE2_SPTR tcode = code + 1 + LINK_SIZE; + + if (*code == OP_CBRA || *code == OP_SCBRA || + *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; + + while (try_next) /* Loop for items in this branch */ + { + int rc; + uint8_t *classmap = NULL; +#ifdef SUPPORT_WIDE_CHARS + PCRE2_UCHAR xclassflags; +#endif + + switch(*tcode) + { + /* If we reach something we don't understand, it means a new opcode has + been created that hasn't been added to this function. Hopefully this + problem will be discovered during testing. */ + + default: + return SSB_UNKNOWN; + + /* Fail for a valid opcode that implies no starting bits. 
*/ + + case OP_ACCEPT: + case OP_ASSERT_ACCEPT: + case OP_ALLANY: + case OP_ANY: + case OP_ANYBYTE: + case OP_CIRCM: + case OP_CLOSE: + case OP_COMMIT: + case OP_COMMIT_ARG: + case OP_COND: + case OP_CREF: + case OP_FALSE: + case OP_TRUE: + case OP_DNCREF: + case OP_DNREF: + case OP_DNREFI: + case OP_DNRREF: + case OP_DOLL: + case OP_DOLLM: + case OP_END: + case OP_EOD: + case OP_EODN: + case OP_EXTUNI: + case OP_FAIL: + case OP_MARK: + case OP_NOT: + case OP_NOTEXACT: + case OP_NOTEXACTI: + case OP_NOTI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + case OP_NOTPROP: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_NOT_HSPACE: + case OP_NOT_VSPACE: + case OP_PRUNE: + case OP_PRUNE_ARG: + case OP_RECURSE: + case OP_REF: + case OP_REFI: + case OP_REVERSE: + case OP_RREF: + case OP_SCOND: + case OP_SET_SOM: + case OP_SKIP: + case OP_SKIP_ARG: + case OP_SOD: + case OP_SOM: + case OP_THEN: + case OP_THEN_ARG: + return SSB_FAIL; + + /* OP_CIRC happens only at the start of an anchored branch (multiline ^ + uses OP_CIRCM). Skip over it. */ + + case OP_CIRC: + tcode += PRIV(OP_lengths)[OP_CIRC]; + break; + + /* A "real" property test implies no starting bits, but the fake property + PT_CLIST identifies a list of characters. These lists are short, as they + are used for characters with more than one "other case", so there is no + point in recognizing them for OP_NOTPROP. 
*/ + + case OP_PROP: + if (tcode[1] != PT_CLIST) return SSB_FAIL; + { + const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2]; + while ((c = *p++) < NOTACHAR) + { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) + { + PCRE2_UCHAR buff[6]; + (void)PRIV(ord2utf)(c, buff); + c = buff[0]; + } +#endif + if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); + } + } + try_next = FALSE; + break; + + /* We can ignore word boundary tests. */ + + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + tcode++; + break; + + /* If we hit a bracket or a positive lookahead assertion, recurse to set + bits from within the subpattern. If it can't find anything, we have to + give up. If it finds some mandatory character(s), we are done for this + branch. Otherwise, carry on scanning after the subpattern. */ + + case OP_BRA: + case OP_SBRA: + case OP_CBRA: + case OP_SCBRA: + case OP_BRAPOS: + case OP_SBRAPOS: + case OP_CBRAPOS: + case OP_SCBRAPOS: + case OP_ONCE: + case OP_SCRIPT_RUN: + case OP_ASSERT: + case OP_ASSERT_NA: + rc = set_start_bits(re, tcode, utf, ucp, depthptr); + if (rc == SSB_DONE) + { + try_next = FALSE; + } + else if (rc == SSB_CONTINUE) + { + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + } + else return rc; /* FAIL, UNKNOWN, or TOODEEP */ + break; + + /* If we hit ALT or KET, it means we haven't found anything mandatory in + this branch, though we might have found something optional. For ALT, we + continue with the next alternative, but we have to arrange that the final + result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, + return SSB_CONTINUE: if this is the top level, that indicates failure, + but after a nested subpattern, it causes scanning to continue. 
*/ + + case OP_ALT: + yield = SSB_CONTINUE; + try_next = FALSE; + break; + + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_KETRPOS: + return SSB_CONTINUE; + + /* Skip over callout */ + + case OP_CALLOUT: + tcode += PRIV(OP_lengths)[OP_CALLOUT]; + break; + + case OP_CALLOUT_STR: + tcode += GET(tcode, 1 + 2*LINK_SIZE); + break; + + /* Skip over lookbehind and negative lookahead assertions */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ASSERTBACK_NA: + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + break; + + /* BRAZERO does the bracket, but carries on. */ + + case OP_BRAZERO: + case OP_BRAMINZERO: + case OP_BRAPOSZERO: + rc = set_start_bits(re, ++tcode, utf, ucp, depthptr); + if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc; + do tcode += GET(tcode,1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + break; + + /* SKIPZERO skips the bracket. */ + + case OP_SKIPZERO: + tcode++; + do tcode += GET(tcode,1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + break; + + /* Single-char * or ? 
sets the bit and tries the next item */ + + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp); + break; + + case OP_STARI: + case OP_MINSTARI: + case OP_POSSTARI: + case OP_QUERYI: + case OP_MINQUERYI: + case OP_POSQUERYI: + tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp); + break; + + /* Single-char upto sets the bit and tries the next */ + + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp); + break; + + case OP_UPTOI: + case OP_MINUPTOI: + case OP_POSUPTOI: + tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp); + break; + + /* At least one single char sets the bit and stops */ + + case OP_EXACT: + tcode += IMM2_SIZE; + /* Fall through */ + case OP_CHAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp); + try_next = FALSE; + break; + + case OP_EXACTI: + tcode += IMM2_SIZE; + /* Fall through */ + case OP_CHARI: + case OP_PLUSI: + case OP_MINPLUSI: + case OP_POSPLUSI: + (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp); + try_next = FALSE; + break; + + /* Special spacing and line-terminating items. These recognize specific + lists of characters. The difference between VSPACE and ANYNL is that the + latter can match the two-character CRLF sequence, but that is not + relevant for finding the first character, so their code here is + identical. */ + + case OP_HSPACE: + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); + + /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set + the bits for 0xA0 and for code units >= 255, independently of UTF. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + SET_BIT(0xA0); + SET_BIT(0xFF); +#else + /* For the 8-bit library in UTF-8 mode, set the bits for the first code + units of horizontal space characters. 
*/ + +#ifdef SUPPORT_UNICODE + if (utf) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else +#endif + /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless + the code is EBCDIC. */ + { +#ifndef EBCDIC + SET_BIT(0xA0); +#endif /* Not EBCDIC */ + } +#endif /* 8-bit support */ + + try_next = FALSE; + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); + + /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set + the bits for NEL and for code units >= 255, independently of UTF. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + SET_BIT(CHAR_NEL); + SET_BIT(0xFF); +#else + /* For the 8-bit library in UTF-8 mode, set the bits for the first code + units of vertical space characters. */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + SET_BIT(0xC2); /* For U+0085 (NEL) */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else +#endif + /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ + { + SET_BIT(CHAR_NEL); + } +#endif /* 8-bit support */ + + try_next = FALSE; + break; + + /* Single character types set the bits and stop. Note that if PCRE2_UCP + is set, we do not see these opcodes because \d etc are converted to + properties. Therefore, these apply in the case when only characters less + than 256 are recognized to match the types. 
*/ + + case OP_NOT_DIGIT: + set_nottype_bits(re, cbit_digit, table_limit); + try_next = FALSE; + break; + + case OP_DIGIT: + set_type_bits(re, cbit_digit, table_limit); + try_next = FALSE; + break; + + case OP_NOT_WHITESPACE: + set_nottype_bits(re, cbit_space, table_limit); + try_next = FALSE; + break; + + case OP_WHITESPACE: + set_type_bits(re, cbit_space, table_limit); + try_next = FALSE; + break; + + case OP_NOT_WORDCHAR: + set_nottype_bits(re, cbit_word, table_limit); + try_next = FALSE; + break; + + case OP_WORDCHAR: + set_type_bits(re, cbit_word, table_limit); + try_next = FALSE; + break; + + /* One or more character type fudges the pointer and restarts, knowing + it will hit a single character type and stop there. */ + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + tcode++; + break; + + case OP_TYPEEXACT: + tcode += 1 + IMM2_SIZE; + break; + + /* Zero or more repeats of character types set the bits and then + try again. */ + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + tcode += IMM2_SIZE; /* Fall through */ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + switch(tcode[1]) + { + default: + case OP_ANY: + case OP_ALLANY: + return SSB_FAIL; + + case OP_HSPACE: + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); + + /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set + the bits for 0xA0 and for code units >= 255, independently of UTF. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + SET_BIT(0xA0); + SET_BIT(0xFF); +#else + /* For the 8-bit library in UTF-8 mode, set the bits for the first code + units of horizontal space characters. 
*/ + +#ifdef SUPPORT_UNICODE + if (utf) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else +#endif + /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless + the code is EBCDIC. */ + { +#ifndef EBCDIC + SET_BIT(0xA0); +#endif /* Not EBCDIC */ + } +#endif /* 8-bit support */ + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); + + /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set + the bits for NEL and for code units >= 255, independently of UTF. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + SET_BIT(CHAR_NEL); + SET_BIT(0xFF); +#else + /* For the 8-bit library in UTF-8 mode, set the bits for the first code + units of vertical space characters. */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + SET_BIT(0xC2); /* For U+0085 (NEL) */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else +#endif + /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ + { + SET_BIT(CHAR_NEL); + } +#endif /* 8-bit support */ + break; + + case OP_NOT_DIGIT: + set_nottype_bits(re, cbit_digit, table_limit); + break; + + case OP_DIGIT: + set_type_bits(re, cbit_digit, table_limit); + break; + + case OP_NOT_WHITESPACE: + set_nottype_bits(re, cbit_space, table_limit); + break; + + case OP_WHITESPACE: + set_type_bits(re, cbit_space, table_limit); + break; + + case OP_NOT_WORDCHAR: + set_nottype_bits(re, cbit_word, table_limit); + break; + + case OP_WORDCHAR: + set_type_bits(re, cbit_word, table_limit); + break; + } + + tcode += 2; + break; + + /* Extended class: if there are any property checks, or if this is a + negative XCLASS without a map, give up. If there are no property checks, + there must be wide characters on the XCLASS list, because otherwise an + XCLASS would not have been created. This means that code points >= 255 + are potential starters. 
In the UTF-8 case we can scan them and set bits + for the relevant leading bytes. */ + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + xclassflags = tcode[1 + LINK_SIZE]; + if ((xclassflags & XCL_HASPROP) != 0 || + (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT) + return SSB_FAIL; + + /* We have a positive XCLASS or a negative one without a map. Set up the + map pointer if there is one, and fall through. */ + + classmap = ((xclassflags & XCL_MAP) == 0)? NULL : + (uint8_t *)(tcode + 1 + LINK_SIZE + 1); + + /* In UTF-8 mode, scan the character list and set bits for leading bytes, + then jump to handle the map. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf && (xclassflags & XCL_NOT) == 0) + { + PCRE2_UCHAR b, e; + PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32); + tcode += GET(tcode, 1); + + for (;;) switch (*p++) + { + case XCL_SINGLE: + b = *p++; + while ((*p & 0xc0) == 0x80) p++; + re->start_bitmap[b/8] |= (1u << (b&7)); + break; + + case XCL_RANGE: + b = *p++; + while ((*p & 0xc0) == 0x80) p++; + e = *p++; + while ((*p & 0xc0) == 0x80) p++; + for (; b <= e; b++) + re->start_bitmap[b/8] |= (1u << (b&7)); + break; + + case XCL_END: + goto HANDLE_CLASSMAP; + + default: + return SSB_UNKNOWN; /* Internal error, should not occur */ + } + } +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ +#endif /* SUPPORT_WIDE_CHARS */ + + /* It seems that the fall through comment must be outside the #ifdef if + it is to avoid the gcc compiler warning. */ + + /* Fall through */ + + /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are + in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter + because it starts a character with a value > 255. In 8-bit non-UTF mode, + there is no difference between CLASS and NCLASS. In all other wide + character modes, set the 0xFF bit to indicate code units >= 255. 
*/ + + case OP_NCLASS: +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) + { + re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */ + memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc8 - 0xff */ + } +#elif PCRE2_CODE_UNIT_WIDTH != 8 + SET_BIT(0xFF); /* For characters >= 255 */ +#endif + /* Fall through */ + + /* Enter here for a positive non-XCLASS. If we have fallen through from + an XCLASS, classmap will already be set; just advance the code pointer. + Otherwise, set up classmap for a non-XCLASS and advance past it. */ + + case OP_CLASS: + if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else + { + classmap = (uint8_t *)(++tcode); + tcode += 32 / sizeof(PCRE2_UCHAR); + } + + /* When wide characters are supported, classmap may be NULL. In UTF-8 + (sic) mode, the bits in a class bit map correspond to character values, + not to byte values. However, the bit map we are constructing is for byte + values. So we have to do a conversion for characters whose code point is + greater than 127. In fact, there are only two possible starting bytes for + characters in the range 128 - 255. */ + +#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8 + HANDLE_CLASSMAP: +#endif + if (classmap != NULL) + { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) + { + for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c]; + for (c = 128; c < 256; c++) + { + if ((classmap[c/8] & (1u << (c&7))) != 0) + { + int d = (c >> 6) | 0xc0; /* Set bit for this starter */ + re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */ + c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ + } + } + } + else +#endif + /* In all modes except UTF-8, the two bit maps are compatible. */ + + { + for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c]; + } + } + + /* Act on what follows the class. For a zero minimum repeat, continue; + otherwise stop processing.
*/ + + switch (*tcode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: + tcode++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; + else try_next = FALSE; + break; + + default: + try_next = FALSE; + break; + } + break; /* End of class handling case */ + } /* End of switch for opcodes */ + } /* End of try_next loop */ + + code += GET(code, 1); /* Advance to next branch */ + } +while (*code == OP_ALT); + +return yield; +} + + + +/************************************************* +* Study a compiled expression * +*************************************************/ + +/* This function is handed a compiled expression that it must study to produce +information that will speed up the matching. On return, re->flags may have gained PCRE2_FIRSTMAPSET, or PCRE2_FIRSTSET (possibly with PCRE2_FIRSTCASELESS) together with re->first_codeunit, and re->minlength may have been set. + +Argument: + re points to the compiled expression + +Returns: 0 normally; non-zero should never normally occur + 1 unknown opcode in set_start_bits + 2 missing capturing bracket + 3 unknown opcode in find_minlength +*/ + +int +PRIV(study)(pcre2_real_code *re) +{ +int count = 0; /* Passed by address to find_minlength */ +PCRE2_UCHAR *code; +BOOL utf = (re->overall_options & PCRE2_UTF) != 0; +BOOL ucp = (re->overall_options & PCRE2_UCP) != 0; + +/* Find start of compiled code */ + +code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + + re->name_entry_size * re->name_count; + +/* For a pattern that has a first code unit, or a multiline pattern that +matches only at "line start", there is no point in seeking a list of starting +code units. */ + +if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) + { + int depth = 0; /* Passed by address to set_start_bits */ + int rc = set_start_bits(re, code, utf, ucp, &depth); + if (rc == SSB_UNKNOWN) return 1; /* Unknown opcode: report internal error */ + + /* If a list of starting code units was set up, scan the list to see if only + one or two were listed. Having only one listed is rare because usually a + single starting code unit will have been recognized and PCRE2_FIRSTSET set.
+ If two are listed, see if they are caseless versions of the same character; + if so we can replace the list with a caseless first code unit. This gives + better performance and is plausibly worth doing for patterns such as [Ww]ord + or (word|WORD). Any result other than SSB_DONE leaves re->flags untouched here. */ + + if (rc == SSB_DONE) + { + int i; + int a = -1; /* First listed code unit, -1 if none yet */ + int b = -1; /* Second listed code unit, -1 if none yet */ + uint8_t *p = re->start_bitmap; + uint32_t flags = PCRE2_FIRSTMAPSET; /* ORed into re->flags at DONE unless replaced below */ + + for (i = 0; i < 256; p++, i += 8) /* One bitmap byte per pass; i is the value of its bit 0 */ + { + uint8_t x = *p; + if (x != 0) + { + int c; + uint8_t y = x & (~x + 1); /* Least significant bit */ + if (y != x) goto DONE; /* More than one bit set */ + + /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and + all wide characters", so we cannot use it here. */ + +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (i == 248 && x == 0x80) goto DONE; +#endif + + /* Compute the character value: i plus the position of the single set bit */ + + c = i; + switch (x) + { + case 1: break; + case 2: c += 1; break; case 4: c += 2; break; + case 8: c += 3; break; case 16: c += 4; break; + case 32: c += 5; break; case 64: c += 6; break; + case 128: c += 7; break; + } + + /* c contains the code unit value, in the range 0-255. In 8-bit UTF + mode, only values < 128 can be used. In all the other cases, c is a + character value. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf && c > 127) goto DONE; +#endif + if (a < 0) a = c; /* First one found, save in a */ + else if (b < 0) /* Second one found */ + { + int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c); + +#ifdef SUPPORT_UNICODE + if (utf || ucp) + { + if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */ + if (c > 127) d = UCD_OTHERCASE(c); + } +#endif /* SUPPORT_UNICODE */ + + if (d != a) goto DONE; /* Not the other case of a */ + b = c; /* Save second in b */ + } + else goto DONE; /* More than two characters found */ + } + } + + /* Replace the start code unit bits with a first code unit, but only if it + is not the same as a required later code unit.
This is because a search for + a required code unit starts after an explicit first code unit, but at a + code unit found from the bitmap. Patterns such as /a*a/ don't work + if both the start unit and required unit are the same. */ + + if (a >= 0 && + ( + (re->flags & PCRE2_LASTSET) == 0 || + ( + re->last_codeunit != (uint32_t)a && + (b < 0 || re->last_codeunit != (uint32_t)b) + ) + )) + { + re->first_codeunit = a; + flags = PCRE2_FIRSTSET; /* Set instead of PCRE2_FIRSTMAPSET */ + if (b >= 0) flags |= PCRE2_FIRSTCASELESS; + } + + DONE: + re->flags |= flags; /* PCRE2_FIRSTMAPSET, or PCRE2_FIRSTSET with optional PCRE2_FIRSTCASELESS */ + } + } + +/* Find the minimum length of subject string. If the pattern can match an empty +string, the minimum length is already known. If the pattern contains (*ACCEPT) +all bets are off, and we don't even try to find a minimum length. If there are +more back references than the size of the vector we are going to cache them in, +do nothing. A pattern that complicated will probably take a long time to +analyze and may in any case turn out to be too complicated. Note that back +reference minima are held as 16-bit numbers. */ + +if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 && + re->top_backref <= MAX_CACHE_BACKREF) + { + int min; + int backref_cache[MAX_CACHE_BACKREF+1]; + backref_cache[0] = 0; /* Highest one that is set */ + min = find_minlength(re, code, code, utf, NULL, &count, backref_cache); + switch(min) + { + case -1: /* \C in UTF mode or over-complex regex */ + break; /* Leave minlength unchanged (will be zero) */ + + case -2: + return 2; /* missing capturing bracket */ + + case -3: + return 3; /* unrecognized opcode */ + + default: + re->minlength = (min > UINT16_MAX)?
UINT16_MAX : min; /* Stored value is capped at UINT16_MAX */ + break; + } + } + +return 0; +} + +/* End of pcre2_study.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substitute.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substitute.c new file mode 100644 index 0000000000..56c8d965e4 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substitute.c @@ -0,0 +1,1009 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + +#define PTR_STACK_SIZE 20 + +#define SUBSTITUTE_OPTIONS \ + (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ + PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ + PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) + + + +/************************************************* +* Find end of substitute text * +*************************************************/ + +/* In extended mode, we recognize ${name:+set text:unset text} and similar +constructions. This requires the identification of unescaped : and } +characters. This function scans for such. It must deal with nested ${ +constructions. The pointer to the text is updated, either to the required end +character, or to where an error was detected. 
+ +Arguments: + code points to the compiled expression (for options) + ptrptr points to the pointer to the start of the text (updated) + ptrend end of the whole string + last TRUE if the last expected string (only } recognized) + +Returns: 0 on success + negative error code on failure +*/ + +static int +find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, + BOOL last) +{ +int rc = 0; +uint32_t nestlevel = 0; +BOOL literal = FALSE; +PCRE2_SPTR ptr = *ptrptr; + +for (; ptr < ptrend; ptr++) + { + if (literal) + { + if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) + { + literal = FALSE; + ptr += 1; + } + } + + else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + if (nestlevel == 0) goto EXIT; + nestlevel--; + } + + else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; + + else if (*ptr == CHAR_DOLLAR_SIGN) + { + if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) + { + nestlevel++; + ptr += 1; + } + } + + else if (*ptr == CHAR_BACKSLASH) + { + int erc; + int errorcode; + uint32_t ch; + + if (ptr < ptrend - 1) switch (ptr[1]) + { + case CHAR_L: + case CHAR_l: + case CHAR_U: + case CHAR_u: + ptr += 1; + continue; + } + + ptr += 1; /* Must point after \ */ + erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, + code->overall_options, code->extra_options, FALSE, NULL); + ptr -= 1; /* Back to last code unit of escape */ + if (errorcode != 0) + { + rc = errorcode; + goto EXIT; + } + + switch(erc) + { + case 0: /* Data character */ + case ESC_E: /* Isolated \E is ignored */ + break; + + case ESC_Q: + literal = TRUE; + break; + + default: + rc = PCRE2_ERROR_BADREPESCAPE; + goto EXIT; + } + } + } + +rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ + +EXIT: +*ptrptr = ptr; +return rc; +} + + + +/************************************************* +* Match and substitute * +*************************************************/ + +/* This function applies a compiled re to a subject string and creates a new 
+string with substitutions. The first 7 arguments are the same as for +pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. + +Arguments: + code points to the compiled expression + subject points to the subject string + length length of subject string (may contain binary zeros) + start_offset where to start in the subject string + options option bits + match_data points to a match_data block, or is NULL + mcontext points to a PCRE2 match context, or is NULL + replacement points to the replacement string + rlength length of replacement string + buffer where to put the substituted string + blength points to length of buffer; updated to length of string + +Returns: >= 0 number of substitutions made + < 0 an error code + PCRE2_ERROR_BADREPLACEMENT means invalid use of $ +*/ + +/* This macro checks for space in the buffer before copying into it. On +overflow, either give an error immediately, or keep on, accumulating the +length. It expands in the caller's scope and uses the locals overflowed, lengthleft, extra_needed, buff_offset, buffer and suboptions, and jumps to the NOROOM label when overflow is an error. Both length and lengthleft are counted in code units; CU2BYTES converts for memcpy. */ + +#define CHECKMEMCPY(from,length) \ + { \ + if (!overflowed && lengthleft < length) \ + { \ + if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ + overflowed = TRUE; \ + extra_needed = length - lengthleft; \ + } \ + else if (overflowed) \ + { \ + extra_needed += length; \ + } \ + else \ + { \ + memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ + buff_offset += length; \ + lengthleft -= length; \ + } \ + } + +/* Here's the function */ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, + PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) +{ +int rc; +int subs; +int forcecase = 0; +int forcecasereset = 0; +uint32_t ovector_count; +uint32_t goptions = 0; +uint32_t suboptions; +pcre2_match_data *internal_match_data = NULL; +BOOL escaped_literal = FALSE; +BOOL overflowed = FALSE; +BOOL use_existing_match; +BOOL
replacement_only; +#ifdef SUPPORT_UNICODE +BOOL utf = (code->overall_options & PCRE2_UTF) != 0; +BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; +#endif +PCRE2_UCHAR temp[6]; +PCRE2_SPTR ptr; +PCRE2_SPTR repend; +PCRE2_SIZE extra_needed = 0; +PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; +PCRE2_SIZE *ovector; +PCRE2_SIZE ovecsave[3]; +pcre2_substitute_callout_block scb; + +/* General initialization */ + +buff_offset = 0; +lengthleft = buff_length = *blength; +*blength = PCRE2_UNSET; +ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; + +/* Partial matching is not valid. This must come after setting *blength to +PCRE2_UNSET, so as not to imply an offset in the replacement. */ + +if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) + return PCRE2_ERROR_BADOPTION; + +/* Validate length and find the end of the replacement. A NULL replacement of +zero length is interpreted as an empty string. */ + +if (replacement == NULL) + { + if (rlength != 0) return PCRE2_ERROR_NULL; + replacement = (PCRE2_SPTR)""; + } + +if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); +repend = replacement + rlength; + +/* Check for using a match that has already happened. Note that the subject +pointer in the match data may be NULL after a no-match. */ + +use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); +replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); + +/* If starting from an existing match, there must be an externally provided +match data block. We create an internal match_data block in two cases: (a) an +external one is not supplied (and we are not starting from an existing match); +(b) an existing match is to be used for the first substitution. In the latter +case, we copy the existing match into the internal block, except for any cached +heap frame size and pointer. This ensures that no changes are made to the +external match data block. 
*/ + +if (match_data == NULL) + { + pcre2_general_context *gcontext; + if (use_existing_match) return PCRE2_ERROR_NULL; + gcontext = (mcontext == NULL)? + (pcre2_general_context *)code : + (pcre2_general_context *)mcontext; + match_data = internal_match_data = + pcre2_match_data_create_from_pattern(code, gcontext); + if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; + } + +else if (use_existing_match) + { + pcre2_general_context *gcontext = (mcontext == NULL)? + (pcre2_general_context *)code : + (pcre2_general_context *)mcontext; + int pairs = (code->top_bracket + 1 < match_data->oveccount)? + code->top_bracket + 1 : match_data->oveccount; + internal_match_data = pcre2_match_data_create(match_data->oveccount, + gcontext); + if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) + + 2*pairs*sizeof(PCRE2_SIZE)); + internal_match_data->heapframes = NULL; + internal_match_data->heapframes_size = 0; + match_data = internal_match_data; + } + +/* Remember ovector details */ + +ovector = pcre2_get_ovector_pointer(match_data); +ovector_count = pcre2_get_ovector_count(match_data); + +/* Fixed things in the callout block */ + +scb.version = 0; +scb.input = subject; +scb.output = (PCRE2_SPTR)buffer; +scb.ovector = ovector; + +/* A NULL subject of zero length is treated as an empty string. */ + +if (subject == NULL) + { + if (length != 0) return PCRE2_ERROR_NULL; + subject = (PCRE2_SPTR)""; + } + +/* Find length of zero-terminated subject */ + +if (length == PCRE2_ZERO_TERMINATED) + length = subject? PRIV(strlen)(subject) : 0; + +/* Check UTF replacement string if necessary. 
*/ + +#ifdef SUPPORT_UNICODE +if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) + { + rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); + if (rc != 0) + { + match_data->leftchar = 0; + goto EXIT; + } + } +#endif /* SUPPORT_UNICODE */ + +/* Save the substitute options and remove them from the match options. */ + +suboptions = options & SUBSTITUTE_OPTIONS; +options &= ~SUBSTITUTE_OPTIONS; + +/* Error if the start match offset is greater than the length of the subject. */ + +if (start_offset > length) + { + match_data->leftchar = 0; + rc = PCRE2_ERROR_BADOFFSET; + goto EXIT; + } + +/* Copy up to the start offset, unless only the replacement is required. */ + +if (!replacement_only) CHECKMEMCPY(subject, start_offset); + +/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first +match is taken from the match_data that was passed in. */ + +subs = 0; +do + { + PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; + uint32_t ptrstackptr = 0; + + if (use_existing_match) + { + rc = match_data->rc; + use_existing_match = FALSE; + } + else rc = pcre2_match(code, subject, length, start_offset, options|goptions, + match_data, mcontext); + +#ifdef SUPPORT_UNICODE + if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ +#endif + + /* Any error other than no match returns the error code. No match when not + doing the special after-empty-match global rematch, or when at the end of the + subject, breaks the global loop. Otherwise, advance the starting point by one + character, copying it to the output, and try again. */ + + if (rc < 0) + { + PCRE2_SIZE save_start; + + if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; + if (goptions == 0 || start_offset >= length) break; + + /* Advance by one code point. Then, if CRLF is a valid newline sequence and + we have advanced into the middle of it, advance one more code point. In + other words, do not start in the middle of CRLF, even if CR and LF on their + own are valid newlines. 
*/ + + save_start = start_offset++; + if (subject[start_offset-1] == CHAR_CR && + code->newline_convention != PCRE2_NEWLINE_CR && + code->newline_convention != PCRE2_NEWLINE_LF && + start_offset < length && + subject[start_offset] == CHAR_LF) + start_offset++; + + /* Otherwise, in UTF mode, advance past any secondary code points. */ + + else if ((code->overall_options & PCRE2_UTF) != 0) + { +#if PCRE2_CODE_UNIT_WIDTH == 8 + while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) + start_offset++; +#elif PCRE2_CODE_UNIT_WIDTH == 16 + while (start_offset < length && + (subject[start_offset] & 0xfc00) == 0xdc00) + start_offset++; +#endif + } + + /* Copy what we have advanced past (unless not required), reset the special + global options, and continue to the next match. */ + + fraglength = start_offset - save_start; + if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); + goptions = 0; + continue; + } + + /* Handle a successful match. Matches that use \K to end before they start + or start before the current point in the subject are not supported. */ + + if (ovector[1] < ovector[0] || ovector[0] < start_offset) + { + rc = PCRE2_ERROR_BADSUBSPATTERN; + goto EXIT; + } + + /* Check for the same match as previous. This is legitimate after matching an + empty string that starts after the initial match offset. We have tried again + at the match point in case the pattern is one like /(?<=\G.)/ which can never + match at its starting point, so running the match achieves the bumpalong. If + we do get the same (null) match at the original match point, it isn't such a + pattern, so we now do the empty string magic. In all other cases, a repeat + match should never occur. 
*/ + + if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) + { + if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) + { + goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + ovecsave[2] = start_offset; + continue; /* Back to the top of the loop */ + } + rc = PCRE2_ERROR_INTERNAL_DUPMATCH; + goto EXIT; + } + + /* Count substitutions with a paranoid check for integer overflow; surely no + real call to this function would ever hit this! */ + + if (subs == INT_MAX) + { + rc = PCRE2_ERROR_TOOMANYREPLACE; + goto EXIT; + } + subs++; + + /* Copy the text leading up to the match (unless not required), and remember + where the insert begins and how many ovector pairs are set. */ + + if (rc == 0) rc = ovector_count; + fraglength = ovector[0] - start_offset; + if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); + scb.output_offsets[0] = buff_offset; + scb.oveccount = rc; + + /* Process the replacement string. If the entire replacement is literal, just + copy it with length check. */ + + ptr = replacement; + if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) + { + CHECKMEMCPY(ptr, rlength); + } + + /* Within a non-literal replacement, which must be scanned character by + character, local literal mode can be set by \Q, but only in extended mode + when backslashes are being interpreted. In extended mode we must handle + nested substrings that are to be reprocessed. */ + + else for (;;) + { + uint32_t ch; + unsigned int chlen; + + /* If at the end of a nested substring, pop the stack. */ + + if (ptr >= repend) + { + if (ptrstackptr == 0) break; /* End of replacement string */ + repend = ptrstack[--ptrstackptr]; + ptr = ptrstack[--ptrstackptr]; + continue; + } + + /* Handle the next character */ + + if (escaped_literal) + { + if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) + { + escaped_literal = FALSE; + ptr += 2; + continue; + } + goto LOADLITERAL; + } + + /* Not in literal mode. 
*/ + + if (*ptr == CHAR_DOLLAR_SIGN) + { + int group, n; + uint32_t special = 0; + BOOL inparens; + BOOL star; + PCRE2_SIZE sublength; + PCRE2_SPTR text1_start = NULL; + PCRE2_SPTR text1_end = NULL; + PCRE2_SPTR text2_start = NULL; + PCRE2_SPTR text2_end = NULL; + PCRE2_UCHAR next; + PCRE2_UCHAR name[33]; + + if (++ptr >= repend) goto BAD; + if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; + + group = -1; + n = 0; + inparens = FALSE; + star = FALSE; + + if (next == CHAR_LEFT_CURLY_BRACKET) + { + if (++ptr >= repend) goto BAD; + next = *ptr; + inparens = TRUE; + } + + if (next == CHAR_ASTERISK) + { + if (++ptr >= repend) goto BAD; + next = *ptr; + star = TRUE; + } + + if (!star && next >= CHAR_0 && next <= CHAR_9) + { + group = next - CHAR_0; + while (++ptr < repend) + { + next = *ptr; + if (next < CHAR_0 || next > CHAR_9) break; + group = group * 10 + next - CHAR_0; + + /* A check for a number greater than the highest captured group + is sufficient here; no need for a separate overflow check. If unknown + groups are to be treated as unset, just skip over any remaining + digits and carry on. */ + + if (group > code->top_bracket) + { + if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); + break; + } + else + { + rc = PCRE2_ERROR_NOSUBSTRING; + goto PTREXIT; + } + } + } + } + else + { + const uint8_t *ctypes = code->tables + ctypes_offset; + while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) + { + name[n++] = next; + if (n > 32) goto BAD; + if (++ptr >= repend) break; + next = *ptr; + } + if (n == 0) goto BAD; + name[n] = 0; + } + + /* In extended mode we recognize ${name:+set text:unset text} and + ${name:-default text}.
*/ + + if (inparens) + { + if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && + !star && ptr < repend - 2 && next == CHAR_COLON) + { + special = *(++ptr); + if (special != CHAR_PLUS && special != CHAR_MINUS) + { + rc = PCRE2_ERROR_BADSUBSTITUTION; + goto PTREXIT; + } + + text1_start = ++ptr; + rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); + if (rc != 0) goto PTREXIT; + text1_end = ptr; + + if (special == CHAR_PLUS && *ptr == CHAR_COLON) + { + text2_start = ++ptr; + rc = find_text_end(code, &ptr, repend, TRUE); + if (rc != 0) goto PTREXIT; + text2_end = ptr; + } + } + + else + { + if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) + { + rc = PCRE2_ERROR_REPMISSINGBRACE; + goto PTREXIT; + } + } + + ptr++; + } + + /* Have found a syntactically correct group number or name, or *name. + Only *MARK is currently recognized. */ + + if (star) + { + if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) + { + PCRE2_SPTR mark = pcre2_get_mark(match_data); + if (mark != NULL) + { + PCRE2_SPTR mark_start = mark; + while (*mark != 0) mark++; + fraglength = mark - mark_start; + CHECKMEMCPY(mark_start, fraglength); + } + } + else goto BAD; + } + + /* Substitute the contents of a group. We don't use substring_copy + functions any more, in order to support case forcing. */ + + else + { + PCRE2_SPTR subptr, subptrend; + + /* Find a number for a named group. In case there are duplicate names, + search for the first one that is set. If the name is not found when + PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a + non-existent group. 
*/ + + if (group < 0) + { + PCRE2_SPTR first, last, entry; + rc = pcre2_substring_nametable_scan(code, name, &first, &last); + if (rc == PCRE2_ERROR_NOSUBSTRING && + (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + group = code->top_bracket + 1; + } + else + { + if (rc < 0) goto PTREXIT; + for (entry = first; entry <= last; entry += rc) + { + uint32_t ng = GET2(entry, 0); + if (ng < ovector_count) + { + if (group < 0) group = ng; /* First in ovector */ + if (ovector[ng*2] != PCRE2_UNSET) + { + group = ng; /* First that is set */ + break; + } + } + } + + /* If group is still negative, it means we did not find a group + that is in the ovector. Just set the first group. */ + + if (group < 0) group = GET2(first, 0); + } + } + + /* We now have a group that is identified by number. Find the length of + the captured string. If a group in a non-special substitution is unset + when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ + + rc = pcre2_substring_length_bynumber(match_data, group, &sublength); + if (rc < 0) + { + if (rc == PCRE2_ERROR_NOSUBSTRING && + (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + rc = PCRE2_ERROR_UNSET; + } + if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ + if (special == 0) /* Plain substitution */ + { + if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; + goto PTREXIT; /* Else error */ + } + } + + /* If special is '+' we have a 'set' and possibly an 'unset' text, + both of which are reprocessed when used. If special is '-' we have a + default text for when the group is unset; it must be reprocessed. 
*/ + + if (special != 0) + { + if (special == CHAR_MINUS) + { + if (rc == 0) goto LITERAL_SUBSTITUTE; + text2_start = text1_start; + text2_end = text1_end; + } + + if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; + ptrstack[ptrstackptr++] = ptr; + ptrstack[ptrstackptr++] = repend; + + if (rc == 0) + { + ptr = text1_start; + repend = text1_end; + } + else + { + ptr = text2_start; + repend = text2_end; + } + continue; + } + + /* Otherwise we have a literal substitution of a group's contents. */ + + LITERAL_SUBSTITUTE: + subptr = subject + ovector[group*2]; + subptrend = subject + ovector[group*2 + 1]; + + /* Substitute a literal string, possibly forcing alphabetic case. */ + + while (subptr < subptrend) + { + GETCHARINCTEST(ch, subptr); + if (forcecase != 0) + { +#ifdef SUPPORT_UNICODE + if (utf || ucp) + { + uint32_t type = UCD_CHARTYPE(ch); + if (PRIV(ucp_gentype)[type] == ucp_L && + type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) + ch = UCD_OTHERCASE(ch); + } + else +#endif + { + if (((code->tables + cbits_offset + + ((forcecase > 0)? cbit_upper:cbit_lower) + )[ch/8] & (1u << (ch%8))) == 0) + ch = (code->tables + fcc_offset)[ch]; + } + forcecase = forcecasereset; + } + +#ifdef SUPPORT_UNICODE + if (utf) chlen = PRIV(ord2utf)(ch, temp); else +#endif + { + temp[0] = ch; + chlen = 1; + } + CHECKMEMCPY(temp, chlen); + } + } + } + + /* Handle an escape sequence in extended mode. We can use check_escape() + to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but + the case-forcing escapes are not supported in pcre2_compile() so must be + recognized here. 
*/ + + else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && + *ptr == CHAR_BACKSLASH) + { + int errorcode; + + if (ptr < repend - 1) switch (ptr[1]) + { + case CHAR_L: + forcecase = forcecasereset = -1; + ptr += 2; + continue; + + case CHAR_l: + forcecase = -1; + forcecasereset = 0; + ptr += 2; + continue; + + case CHAR_U: + forcecase = forcecasereset = 1; + ptr += 2; + continue; + + case CHAR_u: + forcecase = 1; + forcecasereset = 0; + ptr += 2; + continue; + + default: + break; + } + + ptr++; /* Point after \ */ + rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, + code->overall_options, code->extra_options, FALSE, NULL); + if (errorcode != 0) goto BADESCAPE; + + switch(rc) + { + case ESC_E: + forcecase = forcecasereset = 0; + continue; + + case ESC_Q: + escaped_literal = TRUE; + continue; + + case 0: /* Data character */ + goto LITERAL; + + default: + goto BADESCAPE; + } + } + + /* Handle a literal code unit */ + + else + { + LOADLITERAL: + GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ + + LITERAL: + if (forcecase != 0) + { +#ifdef SUPPORT_UNICODE + if (utf || ucp) + { + uint32_t type = UCD_CHARTYPE(ch); + if (PRIV(ucp_gentype)[type] == ucp_L && + type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) + ch = UCD_OTHERCASE(ch); + } + else +#endif + { + if (((code->tables + cbits_offset + + ((forcecase > 0)? cbit_upper:cbit_lower) + )[ch/8] & (1u << (ch%8))) == 0) + ch = (code->tables + fcc_offset)[ch]; + } + forcecase = forcecasereset; + } + +#ifdef SUPPORT_UNICODE + if (utf) chlen = PRIV(ord2utf)(ch, temp); else +#endif + { + temp[0] = ch; + chlen = 1; + } + CHECKMEMCPY(temp, chlen); + } /* End handling a literal code unit */ + } /* End of loop for scanning the replacement. */ + + /* The replacement has been copied to the output, or its size has been + remembered. Do the callout if there is one and we have done an actual + replacement. 
*/ + + if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) + { + scb.subscount = subs; + scb.output_offsets[1] = buff_offset; + rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); + + /* A non-zero return means cancel this substitution. Instead, copy the + matched string fragment. */ + + if (rc != 0) + { + PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; + PCRE2_SIZE oldlength = ovector[1] - ovector[0]; + + buff_offset -= newlength; + lengthleft += newlength; + if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); + + /* A negative return means do not do any more. */ + + if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); + } + } + + /* Save the details of this match. See above for how this data is used. If we + matched an empty string, do the magic for global matches. Update the start + offset to point to the rest of the subject string. If we re-used an existing + match for the first match, switch to the internal match data block. */ + + ovecsave[0] = ovector[0]; + ovecsave[1] = ovector[1]; + ovecsave[2] = start_offset; + + goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : + PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; + start_offset = ovector[1]; + } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ + +/* Copy the rest of the subject unless not required, and terminate the output +with a binary zero. */ + +if (!replacement_only) + { + fraglength = length - start_offset; + CHECKMEMCPY(subject + start_offset, fraglength); + } + +temp[0] = 0; +CHECKMEMCPY(temp, 1); + +/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, +and matching has carried on after a full buffer, in order to compute the length +needed. Otherwise, an overflow generates an immediate error return. 
*/ + +if (overflowed) + { + rc = PCRE2_ERROR_NOMEMORY; + *blength = buff_length + extra_needed; + } + +/* After a successful execution, return the number of substitutions and set the +length of buffer used, excluding the trailing zero. */ + +else + { + rc = subs; + *blength = buff_offset - 1; + } + +EXIT: +if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); + else match_data->rc = rc; +return rc; + +NOROOM: +rc = PCRE2_ERROR_NOMEMORY; +goto EXIT; + +BAD: +rc = PCRE2_ERROR_BADREPLACEMENT; +goto PTREXIT; + +BADESCAPE: +rc = PCRE2_ERROR_BADREPESCAPE; + +PTREXIT: +*blength = (PCRE2_SIZE)(ptr - replacement); +goto EXIT; +} + +/* End of pcre2_substitute.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substring.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substring.c new file mode 100644 index 0000000000..ef82f144c6 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_substring.c @@ -0,0 +1,547 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + +#include "regexp/pcre2/pcre2_internal.h" + + + +/************************************************* +* Copy named captured string to given buffer * +*************************************************/ + +/* This function copies a single captured substring into a given buffer, +identifying it by name. If the regex permits duplicate names, the first +substring that is set is chosen. 
+ +Arguments: + match_data points to the match data + stringname the name of the required substring + buffer where to put the substring + sizeptr the size of the buffer, updated to the size of the substring + +Returns: if successful: zero + if not successful, a negative error code: + (1) an error from nametable_scan() + (2) an error from copy_bynumber() + (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector + (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname, + PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr) +{ +PCRE2_SPTR first, last, entry; +int failrc, entrysize; /* failrc: the error returned if no entry yields a set group */ +if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) + return PCRE2_ERROR_DFA_UFUNC; /* by-name lookup is refused after a DFA-interpreter match */ +entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, + &first, &last); +if (entrysize < 0) return entrysize; /* scan failed: pass its error code through */ +failrc = PCRE2_ERROR_UNAVAILABLE; /* stays so unless some entry's group number fits in the ovector */ +for (entry = first; entry <= last; entry += entrysize) + { + uint32_t n = GET2(entry, 0); /* value at offset 0 of the entry, used as the group number */ + if (n < match_data->oveccount) + { + if (match_data->ovector[n*2] != PCRE2_UNSET) + return pcre2_substring_copy_bynumber(match_data, n, buffer, sizeptr); + failrc = PCRE2_ERROR_UNSET; /* in the ovector but unset: keep scanning the remaining entries */ + } + } +return failrc; +} + + + +/************************************************* +* Copy numbered captured string to given buffer * +*************************************************/ + +/* This function copies a single captured substring into a given buffer, +identifying it by number.
+ +Arguments: + match_data points to the match data + stringnumber the number of the required substring + buffer where to put the substring + sizeptr the size of the buffer, updated to the size of the substring + +Returns: if successful: 0 + if not successful, a negative error code: + PCRE2_ERROR_NOMEMORY: buffer too small + PCRE2_ERROR_NOSUBSTRING: no such substring + PCRE2_ERROR_UNAVAILABLE: ovector too small + PCRE2_ERROR_UNSET: substring is not set +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_copy_bynumber(pcre2_match_data *match_data, + uint32_t stringnumber, PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr) +{ +int rc; +PCRE2_SIZE size; +rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size); +if (rc < 0) return rc; /* length lookup failed: pass its error code through */ +if (size + 1 > *sizeptr) return PCRE2_ERROR_NOMEMORY; /* need room for the substring plus the terminating zero */ +memcpy(buffer, match_data->subject + match_data->ovector[stringnumber*2], + CU2BYTES(size)); +buffer[size] = 0; /* zero-terminate the copy */ +*sizeptr = size; /* reported length does not count the terminating zero */ +return 0; +} + + + +/************************************************* +* Extract named captured string * +*************************************************/ + +/* This function copies a single captured substring, identified by name, into +new memory. If the regex permits duplicate names, the first substring that is +set is chosen.
+ +Arguments: + match_data pointer to match_data + stringname the name of the required substring + stringptr where to put the pointer to the new memory + sizeptr where to put the length of the substring + +Returns: if successful: zero + if not successful, a negative value: + (1) an error from nametable_scan() + (2) an error from get_bynumber() + (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector + (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_get_byname(pcre2_match_data *match_data, + PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr) +{ +PCRE2_SPTR first, last, entry; +int failrc, entrysize; /* failrc: the error returned if no entry yields a set group */ +if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) + return PCRE2_ERROR_DFA_UFUNC; /* by-name lookup is refused after a DFA-interpreter match */ +entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, + &first, &last); +if (entrysize < 0) return entrysize; /* scan failed: pass its error code through */ +failrc = PCRE2_ERROR_UNAVAILABLE; /* stays so unless some entry's group number fits in the ovector */ +for (entry = first; entry <= last; entry += entrysize) + { + uint32_t n = GET2(entry, 0); /* value at offset 0 of the entry, used as the group number */ + if (n < match_data->oveccount) + { + if (match_data->ovector[n*2] != PCRE2_UNSET) + return pcre2_substring_get_bynumber(match_data, n, stringptr, sizeptr); + failrc = PCRE2_ERROR_UNSET; /* in the ovector but unset: keep scanning the remaining entries */ + } + } +return failrc; +} + + + +/************************************************* +* Extract captured string to new memory * +*************************************************/ + +/* This function copies a single captured substring into a piece of new +memory.
+ +Arguments: + match_data points to match data + stringnumber the number of the required substring + stringptr where to put a pointer to the new memory + sizeptr where to put the size of the substring + +Returns: if successful: 0 + if not successful, a negative error code: + PCRE2_ERROR_NOMEMORY: failed to get memory + PCRE2_ERROR_NOSUBSTRING: no such substring + PCRE2_ERROR_UNAVAILABLE: ovector too small + PCRE2_ERROR_UNSET: substring is not set +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_get_bynumber(pcre2_match_data *match_data, + uint32_t stringnumber, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr) +{ +int rc; +PCRE2_SIZE size; +PCRE2_UCHAR *yield; +rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size); +if (rc < 0) return rc; /* length lookup failed: pass its error code through */ +yield = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + + (size + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)match_data); /* one block: a pcre2_memctl header followed by space for the string and its terminating zero. NOTE(review): the size is scaled by PCRE2_CODE_UNIT_WIDTH while the memcpy below uses CU2BYTES(size) - confirm whether CU2BYTES(size + 1) was intended */ +if (yield == NULL) return PCRE2_ERROR_NOMEMORY; +yield = (PCRE2_UCHAR *)(((char *)yield) + sizeof(pcre2_memctl)); /* step over the header; the caller is handed the string part only */ +memcpy(yield, match_data->subject + match_data->ovector[stringnumber*2], + CU2BYTES(size)); +yield[size] = 0; /* zero-terminate the copy */ +*stringptr = yield; +*sizeptr = size; /* reported length does not count the terminating zero */ +return 0; +} + + + +/************************************************* +* Free memory obtained by get_substring * +*************************************************/ + +/* +Argument: the result of a previous pcre2_substring_get_byxxx() +Returns: nothing +*/ + +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_substring_free(PCRE2_UCHAR *string) +{ +if (string != NULL) /* a NULL argument is accepted and ignored */ + { + pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl)); /* step back to the pcre2_memctl header placed in front of the string */ + memctl->free(memctl, memctl->memory_data); /* release the whole block through the free function stored in that header */ + } +} + + + +/************************************************* +* Get length of a named substring * +*************************************************/ + +/* This function returns the length of a named captured substring. If the regex +permits duplicate names, the first substring that is set is chosen.
+ +Arguments: + match_data pointer to match data + stringname the name of the required substring + sizeptr where to put the length + +Returns: 0 if successful, else a negative error number +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_length_byname(pcre2_match_data *match_data, + PCRE2_SPTR stringname, PCRE2_SIZE *sizeptr) +{ +PCRE2_SPTR first, last, entry; +int failrc, entrysize; /* failrc: the error returned if no entry yields a set group */ +if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER) + return PCRE2_ERROR_DFA_UFUNC; /* by-name lookup is refused after a DFA-interpreter match */ +entrysize = pcre2_substring_nametable_scan(match_data->code, stringname, + &first, &last); +if (entrysize < 0) return entrysize; /* scan failed: pass its error code through */ +failrc = PCRE2_ERROR_UNAVAILABLE; /* stays so unless some entry's group number fits in the ovector */ +for (entry = first; entry <= last; entry += entrysize) + { + uint32_t n = GET2(entry, 0); /* value at offset 0 of the entry, used as the group number */ + if (n < match_data->oveccount) + { + if (match_data->ovector[n*2] != PCRE2_UNSET) + return pcre2_substring_length_bynumber(match_data, n, sizeptr); + failrc = PCRE2_ERROR_UNSET; /* in the ovector but unset: keep scanning the remaining entries */ + } + } +return failrc; +} + + + +/************************************************* +* Get length of a numbered substring * +*************************************************/ + +/* This function returns the length of a captured substring. If the start is +beyond the end (which can happen when \K is used in an assertion), it sets the +length to zero.
+ +Arguments: + match_data pointer to match data + stringnumber the number of the required substring + sizeptr where to put the length, if not NULL + +Returns: if successful: 0 + if not successful, a negative error code: + PCRE2_ERROR_NOSUBSTRING: no such substring + PCRE2_ERROR_UNAVAILABLE: ovector is too small + PCRE2_ERROR_UNSET: substring is not set +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_length_bynumber(pcre2_match_data *match_data, + uint32_t stringnumber, PCRE2_SIZE *sizeptr) +{ +PCRE2_SIZE left, right; +int count = match_data->rc; /* the rc field recorded in the match data */ +if (count == PCRE2_ERROR_PARTIAL) + { + if (stringnumber > 0) return PCRE2_ERROR_PARTIAL; /* with a partial match only string 0 can be measured */ + count = 0; /* zero disables the count comparison in the DFA branch below */ + } +else if (count < 0) return count; /* Match failed */ + +if (match_data->matchedby != PCRE2_MATCHEDBY_DFA_INTERPRETER) + { + if (stringnumber > match_data->code->top_bracket) + return PCRE2_ERROR_NOSUBSTRING; + if (stringnumber >= match_data->oveccount) + return PCRE2_ERROR_UNAVAILABLE; + if (match_data->ovector[stringnumber*2] == PCRE2_UNSET) + return PCRE2_ERROR_UNSET; + } +else /* Matched using pcre2_dfa_match() */ + { + if (stringnumber >= match_data->oveccount) return PCRE2_ERROR_UNAVAILABLE; + if (count != 0 && stringnumber >= (uint32_t)count) return PCRE2_ERROR_UNSET; /* strings numbered count or above are reported as unset */ + } + +left = match_data->ovector[stringnumber*2]; +right = match_data->ovector[stringnumber*2+1]; +if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left; /* a start beyond the end is reported as length zero */ +return 0; +} + + + +/************************************************* +* Extract all captured strings to new memory * +*************************************************/ + +/* This function gets one chunk of memory and builds a list of pointers and all +the captured substrings in it. A NULL pointer is put on the end of the list. +The substrings are zero-terminated, but also, if the final argument is +non-NULL, a list of lengths is also returned. This allows binary data to be +handled.
+ +Arguments: + match_data points to the match data + listptr set to point to the list of pointers + lengthsptr set to point to the list of lengths (may be NULL) + +Returns: if successful: 0 + if not successful, a negative error code: + PCRE2_ERROR_NOMEMORY: failed to get memory, + or a match failure code +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, + PCRE2_SIZE **lengthsptr) +{ +int i, count, count2; +PCRE2_SIZE size; +PCRE2_SIZE *lensp; +pcre2_memctl *memp; +PCRE2_UCHAR **listp; +PCRE2_UCHAR *sp; +PCRE2_SIZE *ovector; + +if ((count = match_data->rc) < 0) return count; /* Match failed */ +if (count == 0) count = match_data->oveccount; /* Ovector too small */ + +count2 = 2*count; /* two ovector slots (start, end) per string */ +ovector = match_data->ovector; +size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *); /* For final NULL */ +if (lengthsptr != NULL) size += sizeof(PCRE2_SIZE) * count; /* For lengths */ + +for (i = 0; i < count2; i += 2) + { + size += sizeof(PCRE2_UCHAR *) + CU2BYTES(1); /* one list pointer and one terminating zero per string */ + if (ovector[i+1] > ovector[i]) size += CU2BYTES(ovector[i+1] - ovector[i]); + } + +memp = PRIV(memctl_malloc)(size, (pcre2_memctl *)match_data); +if (memp == NULL) return PCRE2_ERROR_NOMEMORY; + +*listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl)); /* pointer list starts right after the pcre2_memctl header */ +lensp = (PCRE2_SIZE *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1)); /* first byte after the count+1 list pointers */ + +if (lengthsptr == NULL) + { + sp = (PCRE2_UCHAR *)lensp; /* no length list: string data follows the pointers directly */ + lensp = NULL; + } +else + { + *lengthsptr = lensp; + sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(PCRE2_SIZE) * count); /* string data follows the count lengths */ + } + +for (i = 0; i < count2; i += 2) + { + size = (ovector[i+1] > ovector[i])? (ovector[i+1] - ovector[i]) : 0; + + /* Size == 0 includes the case when the capture is unset. Avoid adding + PCRE2_UNSET to match_data->subject because it overflows, even though with + zero size calling memcpy() is harmless.
*/ + + if (size != 0) memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size)); + *listp++ = sp; + if (lensp != NULL) *lensp++ = size; + sp += size; + *sp++ = 0; /* zero-terminate this string and move to where the next one starts */ + } + +*listp = NULL; /* the list of pointers ends with NULL */ +return 0; +} + + + +/************************************************* +* Free memory obtained by substring_list_get * +*************************************************/ + +/* +Argument: the result of a previous pcre2_substring_list_get() +Returns: nothing +*/ + +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +pcre2_substring_list_free(PCRE2_SPTR *list) +{ +if (list != NULL) /* a NULL argument is accepted and ignored */ + { + pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl)); /* step back to the pcre2_memctl header placed in front of the list */ + memctl->free(memctl, memctl->memory_data); /* release the whole block through the free function stored in that header */ + } +} + + + +/************************************************* +* Find (multiple) entries for named string * +*************************************************/ + +/* This function scans the nametable for a given name, using binary chop. It +returns either two pointers to the entries in the table, or, if no pointers are +given, the number of a unique group with the given name. If duplicate names are +permitted, and the name is not unique, an error is generated.
+ +Arguments: + code the compiled regex + stringname the name whose entries required + firstptr where to put the pointer to the first entry + lastptr where to put the pointer to the last entry + +Returns: PCRE2_ERROR_NOSUBSTRING if the name is not found + otherwise, if firstptr and lastptr are NULL: + a group number for a unique substring + else PCRE2_ERROR_NOUNIQUESUBSTRING + otherwise: + the length of each entry, having set firstptr and lastptr +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname, + PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr) +{ +uint16_t bot = 0; +uint16_t top = code->name_count; /* search range is [bot, top) */ +uint16_t entrysize = code->name_entry_size; +PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code)); /* the table is read from the bytes that follow the pcre2_real_code structure */ + +while (top > bot) + { + uint16_t mid = (top + bot) / 2; + PCRE2_SPTR entry = nametable + entrysize*mid; + int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE); /* the name is compared starting IMM2_SIZE code units into the entry */ + if (c == 0) + { + PCRE2_SPTR first; + PCRE2_SPTR last; + PCRE2_SPTR lastentry; + lastentry = nametable + entrysize * (code->name_count - 1); + first = last = entry; /* then widen [first, last] over neighbouring entries that carry the same name */ + while (first > nametable) + { + if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break; + first -= entrysize; + } + while (last < lastentry) + { + if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break; + last += entrysize; + } + if (firstptr == NULL) return (first == last)? + (int)GET2(entry, 0) : PCRE2_ERROR_NOUNIQUESUBSTRING; /* number-only form: succeeds only when exactly one entry has the name */ + *firstptr = first; + *lastptr = last; /* only firstptr was tested above, so lastptr must be non-NULL whenever firstptr is */ + return entrysize; + } + if (c > 0) bot = mid + 1; else top = mid; + } + +return PCRE2_ERROR_NOSUBSTRING; +} + + +/************************************************* +* Find number for named string * +*************************************************/ + +/* This function is a convenience wrapper for pcre2_substring_nametable_scan() +when it is known that names are unique. If there are duplicate names, it is not +defined which number is returned.
+ +Arguments: + code the compiled regex + stringname the name whose number is required + +Returns: the number of the named parenthesis, or a negative number + PCRE2_ERROR_NOSUBSTRING if not found + PCRE2_ERROR_NOUNIQUESUBSTRING if not unique +*/ + +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_substring_number_from_name(const pcre2_code *code, + PCRE2_SPTR stringname) +{ +return pcre2_substring_nametable_scan(code, stringname, NULL, NULL); /* NULL first/last pointers select the scan's number-returning form */ +} + +/* End of pcre2_substring.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_tables.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_tables.c new file mode 100644 index 0000000000..4334939d0b --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_tables.c @@ -0,0 +1,1757 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains some fixed tables that are used by more than one of the +PCRE2 code modules. The tables are also #included by the pcre2test program, +which uses macros to change their names from _pcre2_xxx to xxxx, thereby +avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is +defined. */ + +#ifndef PCRE2_PCRE2TEST /* We're compiling the library */ +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif +#include "regexp/pcre2/pcre2_internal.h" +#endif /* PCRE2_PCRE2TEST */ + +/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that +the definition is next to the definition of the opcodes in pcre2_internal.h. +This is mode-dependent, so it is skipped when this file is included by +pcre2test. */ + +#ifndef PCRE2_PCRE2TEST +const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS }; /* the whole initializer list is supplied by the OP_LENGTHS macro */ +#endif + +/* Tables of horizontal and vertical whitespace characters, suitable for +adding to classes.
*/ + +const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST }; +const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST }; + +/* These tables are the pairs of delimiters that are valid for callout string +arguments. For each starting delimiter there must be a matching ending +delimiter, which in fact is different only for bracket-like delimiters. */ + +const uint32_t PRIV(callout_start_delims)[] = { + CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, + CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, + CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 }; /* entries are in the same order as callout_end_delims; the list ends with a zero entry */ + +const uint32_t PRIV(callout_end_delims[]) = { + CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, + CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, + CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 }; /* differs from the start list only in the curly-bracket entry. NOTE(review): the [] sits inside the PRIV() argument here, unlike every other table in this file - presumably harmless, confirm against the PRIV definition */ + + +/************************************************* +* Tables for UTF-8 support * +*************************************************/ + +/* These tables are required by pcre2test in 16- or 32-bit mode, as well +as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for +handling wide characters. */ + +#if defined PCRE2_PCRE2TEST || \ + (defined SUPPORT_UNICODE && \ + defined PCRE2_CODE_UNIT_WIDTH && \ + PCRE2_CODE_UNIT_WIDTH == 8) + +/* These are the breakpoints for different numbers of bytes in a UTF-8 +character. */ + +const int PRIV(utf8_table1)[] = + { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; + +const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int); /* number of entries in utf8_table1 */ + +/* These are the indicator bits and the mask for the data bits to set in the +first byte of a character, indexed by the number of additional bytes. */ + +const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; /* the indicator bits */ +const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /* the data-bit masks */ + +/* Table of the number of extra bytes, indexed by the first byte masked with +0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d.
*/ + +const uint8_t PRIV(utf8_table4)[] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; + +#endif /* UTF-8 support needed */ + +/* Tables concerned with Unicode properties are relevant only when Unicode +support is enabled. See also the pcre2_ucptables.c file, which is generated by +a Python script from Unicode data files. */ + +#ifdef SUPPORT_UNICODE + +/* Table to translate from particular type value to the general value. */ + +const uint32_t PRIV(ucp_gentype)[] = { + ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ + ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ + ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ + ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ + ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ + ucp_P, ucp_P, /* Ps, Po */ + ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ + ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ +}; + +/* This table encodes the rules for finding the end of an extended grapheme +cluster. Every code point has a grapheme break property which is one of the +ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions +10 and 11. The 2-dimensional table is indexed by the properties of two adjacent +code points. The left property selects a word from the table, and the right +property selects a bit from that word like this: + + PRIV(ucp_gbtable)[left-property] & (1u << right-property) + +The value is non-zero if a grapheme break is NOT permitted between the relevant +two code points. The breaking rules are as follows: + +1. Break at the start and end of text (pretty obviously). + +2. Do not break between a CR and LF; otherwise, break before and after + controls. + +3. Do not break Hangul syllable sequences, the rules for which are: + + L may be followed by L, V, LV or LVT + LV or V may be followed by V or T + LVT or T may be followed by T + +4. 
Do not break before extending characters or zero-width-joiner (ZWJ). + +The following rules are only for extended grapheme clusters (but that's what we +are implementing). + +5. Do not break before SpacingMarks. + +6. Do not break after Prepend characters. + +7. Do not break within emoji modifier sequences or emoji zwj sequences. That + is, do not break between characters with the Extended_Pictographic property. + Extend and ZWJ characters are allowed between the characters; this cannot be + represented in this table, the code has to deal with it. + +8. Do not break within emoji flag sequences. That is, do not break between + regional indicator (RI) symbols if there are an odd number of RI characters + before the break point. This table encodes "join RI characters"; the code + has to deal with checking for previous adjoining RIs. + +9. Otherwise, break everywhere. +*/ + +#define ESZ (1< 0x10ffff is not permitted +PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted +PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence +PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence +PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence +PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) +PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) +PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) +PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff +*/ + +for (p = string; length > 0; p++) + { + uint32_t ab, d; + + c = *p; + length--; + + if (c < 128) continue; /* ASCII character */ + + if (c < 0xc0) /* Isolated 10xx xxxx byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF8_ERR20; + } + + if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ + { + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF8_ERR21; + } + + ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ + if (length < ab) /* Missing bytes */ + { + *erroroffset = (PCRE2_SIZE)(p - string); + 
switch(ab - length) + { + case 1: return PCRE2_ERROR_UTF8_ERR1; + case 2: return PCRE2_ERROR_UTF8_ERR2; + case 3: return PCRE2_ERROR_UTF8_ERR3; + case 4: return PCRE2_ERROR_UTF8_ERR4; + case 5: return PCRE2_ERROR_UTF8_ERR5; + } + } + length -= ab; /* Length remaining */ + + /* Check top bits in the second byte */ + + if (((d = *(++p)) & 0xc0) != 0x80) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 1; + return PCRE2_ERROR_UTF8_ERR6; + } + + /* For each length, check that the remaining bytes start with the 0x80 bit + set and not the 0x40 bit. Then check for an overlong sequence, and for the + excluded range 0xd800 to 0xdfff. */ + + switch (ab) + { + /* 2-byte character. No further bytes to check for 0x80. Check first byte + for for xx00 000x (overlong sequence). */ + + case 1: if ((c & 0x3e) == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 1; + return PCRE2_ERROR_UTF8_ERR15; + } + break; + + /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes + for 1110 0000, xx0x xxxx (overlong sequence) or + 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ + + case 2: + if ((*(++p) & 0xc0) != 0x80) /* Third byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR7; + } + if (c == 0xe0 && (d & 0x20) == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR16; + } + if (c == 0xed && d >= 0xa0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR14; + } + break; + + /* 4-byte character. Check 3rd and 4th bytes for 0x80. 
Then check first 2 + bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a + character greater than 0x0010ffff (f4 8f bf bf) */ + + case 3: + if ((*(++p) & 0xc0) != 0x80) /* Third byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR7; + } + if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 3; + return PCRE2_ERROR_UTF8_ERR8; + } + if (c == 0xf0 && (d & 0x30) == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 3; + return PCRE2_ERROR_UTF8_ERR17; + } + if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 3; + return PCRE2_ERROR_UTF8_ERR13; + } + break; + + /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be + rejected by the length test below. However, we do the appropriate tests + here so that overlong sequences get diagnosed, and also in case there is + ever an option for handling these larger code points. */ + + /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for + 1111 1000, xx00 0xxx */ + + case 4: + if ((*(++p) & 0xc0) != 0x80) /* Third byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR7; + } + if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 3; + return PCRE2_ERROR_UTF8_ERR8; + } + if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 4; + return PCRE2_ERROR_UTF8_ERR9; + } + if (c == 0xf8 && (d & 0x38) == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 4; + return PCRE2_ERROR_UTF8_ERR18; + } + break; + + /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for + 1111 1100, xx00 00xx. 
*/ + + case 5: + if ((*(++p) & 0xc0) != 0x80) /* Third byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 2; + return PCRE2_ERROR_UTF8_ERR7; + } + if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 3; + return PCRE2_ERROR_UTF8_ERR8; + } + if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 4; + return PCRE2_ERROR_UTF8_ERR9; + } + if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ + { + *erroroffset = (PCRE2_SIZE)(p - string) - 5; + return PCRE2_ERROR_UTF8_ERR10; + } + if (c == 0xfc && (d & 0x3c) == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 5; + return PCRE2_ERROR_UTF8_ERR19; + } + break; + } + + /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are + excluded by RFC 3629. The pointer p is currently at the last byte of the + character. */ + + if (ab > 3) + { + *erroroffset = (PCRE2_SIZE)(p - string) - ab; + return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; + } + } +return 0; + + +/* ----------------- Check a UTF-16 string ----------------- */ + +#elif PCRE2_CODE_UNIT_WIDTH == 16 + +/* There's not so much work, nor so many errors, for UTF-16. +PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string +PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate +PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate +*/ + +for (p = string; length > 0; p++) + { + c = *p; + length--; + + if ((c & 0xf800) != 0xd800) + { + /* Normal UTF-16 code point. Neither high nor low surrogate. */ + } + else if ((c & 0x0400) == 0) + { + /* High surrogate. Must be a followed by a low surrogate. */ + if (length == 0) + { + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF16_ERR1; + } + p++; + length--; + if ((*p & 0xfc00) != 0xdc00) + { + *erroroffset = (PCRE2_SIZE)(p - string) - 1; + return PCRE2_ERROR_UTF16_ERR2; + } + } + else + { + /* Isolated low surrogate. Always an error. 
*/ + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF16_ERR3; + } + } +return 0; + + + +/* ----------------- Check a UTF-32 string ----------------- */ + +#else + +/* There is very little to do for a UTF-32 string. +PCRE2_ERROR_UTF32_ERR1 Surrogate character +PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff +*/ + +for (p = string; length > 0; length--, p++) + { + c = *p; + if ((c & 0xfffff800u) != 0xd800u) + { + /* Normal UTF-32 code point. Neither high nor low surrogate. */ + if (c > 0x10ffffu) + { + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF32_ERR2; + } + } + else + { + /* A surrogate */ + *erroroffset = (PCRE2_SIZE)(p - string); + return PCRE2_ERROR_UTF32_ERR1; + } + } +return 0; +#endif /* CODE_UNIT_WIDTH */ +} +#endif /* SUPPORT_UNICODE */ + +/* End of pcre2_valid_utf.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_xclass.c b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_xclass.c new file mode 100644 index 0000000000..26596fd224 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/pcre2/pcre2_xclass.c @@ -0,0 +1,289 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2022 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains an internal function that is used to match an extended +class. It is used by pcre2_auto_possessify() and by both pcre2_match() and +pcre2_def_match(). */ + + +#ifdef HAVE_CONFIG_H +#include "regexp/pcre2/config.h" +#endif + + +#include "regexp/pcre2/pcre2_internal.h" + +/************************************************* +* Match character against an XCLASS * +*************************************************/ + +/* This function is called to match a character against an extended class that +might contain codepoints above 255 and/or Unicode properties. 
+ +Arguments: + c the character + data points to the flag code unit of the XCLASS data + utf TRUE if in UTF mode + +Returns: TRUE if character matches, else FALSE +*/ + +BOOL +PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf) +{ +PCRE2_UCHAR t; +BOOL negated = (*data & XCL_NOT) != 0; + +#if PCRE2_CODE_UNIT_WIDTH == 8 +/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ +utf = TRUE; +#endif + +/* Code points < 256 are matched against a bitmap, if one is present. If not, +we still carry on, because there may be ranges that start below 256 in the +additional data. */ + +if (c < 256) + { + if ((*data & XCL_HASPROP) == 0) + { + if ((*data & XCL_MAP) == 0) return negated; + return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0; + } + if ((*data & XCL_MAP) != 0 && + (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0) + return !negated; /* char found */ + } + +/* First skip the bit map if present. Then match against the list of Unicode +properties or large chars or ranges that end with a large char. We won't ever +encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. 
*/ + +if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR); + +while ((t = *data++) != XCL_END) + { + uint32_t x, y; + if (t == XCL_SINGLE) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + } + else +#endif + x = *data++; + if (c == x) return !negated; + } + else if (t == XCL_RANGE) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + GETCHARINC(y, data); /* macro generates multiple statements */ + } + else +#endif + { + x = *data++; + y = *data++; + } + if (c >= x && c <= y) return !negated; + } + +#ifdef SUPPORT_UNICODE + else /* XCL_PROP & XCL_NOTPROP */ + { + const ucd_record *prop = GET_UCD(c); + BOOL isprop = t == XCL_PROP; + BOOL ok; + + switch(*data) + { + case PT_ANY: + if (isprop) return !negated; + break; + + case PT_LAMP: + if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == isprop) return !negated; + break; + + case PT_GC: + if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop) + return !negated; + break; + + case PT_PC: + if ((data[1] == prop->chartype) == isprop) return !negated; + break; + + case PT_SC: + if ((data[1] == prop->script) == isprop) return !negated; + break; + + case PT_SCX: + ok = (data[1] == prop->script || + MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0); + if (ok == isprop) return !negated; + break; + + case PT_ALNUM: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) + return !negated; + break; + + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. 
*/ + + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (isprop) return !negated; + break; + + default: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop) + return !negated; + break; + } + break; + + case PT_WORD: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) + == isprop) + return !negated; + break; + + case PT_UCNC: + if (c < 0xa0) + { + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT) == isprop) + return !negated; + } + else + { + if ((c < 0xd800 || c > 0xdfff) == isprop) + return !negated; + } + break; + + case PT_BIDICL: + if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop) + return !negated; + break; + + case PT_BOOL: + ok = MAPBIT(PRIV(ucd_boolprop_sets) + + UCD_BPROPS_PROP(prop), data[1]) != 0; + if (ok == isprop) return !negated; + break; + + /* The following three properties can occur only in an XCLASS, as there + is no \p or \P coding for them. */ + + /* Graphic character. Implement this as not Z (space or separator) and + not C (other), except for Cf (format) with a few exceptions. This seems + to be what Perl does. The exceptional characters are: + + U+061C Arabic Letter Mark + U+180E Mongolian Vowel Separator + U+2066 - U+2069 Various "isolate"s + */ + + case PT_PXGRAPH: + if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z && + (PRIV(ucp_gentype)[prop->chartype] != ucp_C || + (prop->chartype == ucp_Cf && + c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)) + )) == isprop) + return !negated; + break; + + /* Printable character: same as graphic, with the addition of Zs, i.e. + not Zl and not Zp, and U+180E. 
*/ + + case PT_PXPRINT: + if ((prop->chartype != ucp_Zl && + prop->chartype != ucp_Zp && + (PRIV(ucp_gentype)[prop->chartype] != ucp_C || + (prop->chartype == ucp_Cf && + c != 0x061c && (c < 0x2066 || c > 0x2069)) + )) == isprop) + return !negated; + break; + + /* Punctuation: all Unicode punctuation, plus ASCII characters that + Unicode treats as symbols rather than punctuation, for Perl + compatibility (these are $+<=>^`|~). */ + + case PT_PXPUNCT: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || + (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) + return !negated; + break; + + /* This should never occur, but compilers may mutter if there is no + default. */ + + default: + return FALSE; + } + + data += 2; + } +#else + (void)utf; /* Avoid compiler warning */ +#endif /* SUPPORT_UNICODE */ + } + +return negated; /* char did not match */ +} + +/* End of pcre2_xclass.c */ diff --git a/libsql-ffi/bundled/sqlean/regexp/regexp.c b/libsql-ffi/bundled/sqlean/regexp/regexp.c new file mode 100644 index 0000000000..d786ddd0ed --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/regexp.c @@ -0,0 +1,157 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +/* + * PCRE wrapper. + */ + +#include +#include +#include +#include + +#include "regexp/pcre2/pcre2.h" +#include "regexp/regexp.h" + +// regexp_compile compiles and returns the compiled regexp. +pcre2_code* regexp_compile(const char* pattern) { + size_t erroffset; + int errcode; + uint32_t options = PCRE2_UCP | PCRE2_UTF; + pcre2_code* re = pcre2_compile((PCRE2_SPTR8)pattern, PCRE2_ZERO_TERMINATED, options, &errcode, + &erroffset, NULL); + return re; +} + +// regexp_free frees the compiled regexp. +void regexp_free(pcre2_code* re) { + pcre2_code_free(re); +} + +// regexp_get_error returns the error message for a given pattern. 
+char* regexp_get_error(const char* pattern) { + size_t erroffset; + int errcode; + uint32_t options = PCRE2_UCP | PCRE2_UTF; + pcre2_code* re = pcre2_compile((PCRE2_SPTR8)pattern, PCRE2_ZERO_TERMINATED, options, &errcode, + &erroffset, NULL); + + if (re != NULL) { + // free the compiled pattern if successful + pcre2_code_free(re); + return NULL; + } + + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errcode, buffer, sizeof(buffer)); + + // Allocate memory for the error message + // (additional space for formatting) + char* msg = (char*)malloc(256 + 32); + if (msg != NULL) { + snprintf(msg, 256 + 32, "%s (offset %d)", buffer, (int)erroffset); + } + return msg; +} + +// regexp_like checks if source string matches pattern. +// Returns: +// -1 if the pattern is invalid +// 0 if there is no match +// 1 if there is a match +int regexp_like(pcre2_code* re, const char* source) { + if (re == NULL) { + return -1; + } + + pcre2_match_data* match_data; + match_data = pcre2_match_data_create_from_pattern(re, NULL); + + size_t source_len = strlen(source); + + int rc = pcre2_match(re, (const unsigned char*)source, source_len, 0, 0, match_data, NULL); + + pcre2_match_data_free(match_data); + + if (rc <= 0) { + return 0; + } else { + return 1; + } +} + +// regexp_extract extracts source substring matching pattern into substr. +// If group_idx > 0, returns the corresponding group instead of the whole matched substring. 
+// Returns: +// -1 if the pattern is invalid +// 0 if there is no match +// 1 if there is a match +int regexp_extract(pcre2_code* re, const char* source, size_t group_idx, char** substr) { + if (re == NULL) { + return -1; + } + + pcre2_match_data* match_data; + match_data = pcre2_match_data_create_from_pattern(re, NULL); + + int rc = pcre2_match(re, (const unsigned char*)source, PCRE2_ZERO_TERMINATED, 0, 0, match_data, + NULL); + + if (rc <= 0) { + pcre2_match_data_free(match_data); + return 0; + } + + if (group_idx >= (size_t)rc) { + pcre2_match_data_free(match_data); + return 0; + } + + size_t* ovector = pcre2_get_ovector_pointer(match_data); + + const char* substr_start = source + ovector[2 * group_idx]; + size_t substr_len = ovector[2 * group_idx + 1] - ovector[2 * group_idx]; + + *substr = malloc(substr_len + 1); + memcpy(*substr, substr_start, substr_len); + (*substr)[substr_len] = '\0'; + + pcre2_match_data_free(match_data); + return 1; +} + +// regexp_replace replaces matching substring with replacement string into `dest`. 
+// Returns: +// -1 if the pattern is invalid +// 0 if there is no match +// 1 if there is a match +int regexp_replace(pcre2_code* re, const char* source, const char* repl, char** dest) { + if (re == NULL) { + return -1; + } + + pcre2_match_data* match_data; + match_data = pcre2_match_data_create_from_pattern(re, NULL); + + const int options = PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED; + size_t source_len = strlen(source); + size_t outlen = source_len + 1024; + char* output = malloc(outlen); + int rc = pcre2_substitute(re, (const unsigned char*)source, PCRE2_ZERO_TERMINATED, 0, options, + match_data, NULL, (const unsigned char*)repl, PCRE2_ZERO_TERMINATED, + (unsigned char*)output, &outlen); + + if (rc <= 0) { + pcre2_match_data_free(match_data); + free(output); + return 0; + } + + *dest = malloc(outlen + 1); + memcpy(*dest, output, outlen); + (*dest)[outlen] = '\0'; + + pcre2_match_data_free(match_data); + free(output); + return 1; +} diff --git a/libsql-ffi/bundled/sqlean/regexp/regexp.h b/libsql-ffi/bundled/sqlean/regexp/regexp.h new file mode 100644 index 0000000000..84a8f20448 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/regexp/regexp.h @@ -0,0 +1,16 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +#ifndef REGEXP_H +#define REGEXP_H + +#include "regexp/pcre2/pcre2.h" + +pcre2_code* regexp_compile(const char* pattern); +void regexp_free(pcre2_code* re); +char* regexp_get_error(const char* pattern); +int regexp_like(pcre2_code* re, const char* source); +int regexp_extract(pcre2_code* re, const char* source, size_t group_idx, char** substr); +int regexp_replace(pcre2_code* re, const char* source, const char* repl, char** dest); + +#endif /* REGEXP_H */ diff --git a/libsql-ffi/bundled/sqlean/sqlean.h b/libsql-ffi/bundled/sqlean/sqlean.h new file mode 100644 index 0000000000..b148450e46 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlean.h @@ -0,0 +1,11 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License 
+// https://github.com/nalgeon/sqlean + +#ifndef SQLEAN_H +#define SQLEAN_H + +#ifndef SQLEAN_VERSION +#define SQLEAN_VERSION "main" +#endif + +#endif /* SQLEAN_H */ diff --git a/libsql-ffi/bundled/sqlean/sqlite3-crypto.c b/libsql-ffi/bundled/sqlean/sqlite3-crypto.c new file mode 100644 index 0000000000..8b295f5aaa --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-crypto.c @@ -0,0 +1,26 @@ +// Copyright (c) 2021 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite hash and encode/decode functions. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "crypto/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_crypto_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return crypto_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-define.c b/libsql-ffi/bundled/sqlean/sqlite3-define.c new file mode 100644 index 0000000000..0ce6b984ed --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-define.c @@ -0,0 +1,26 @@ +// Copyright (c) 2022 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// User-defined functions in SQLite. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "define/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_define_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return define_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-fileio.c b/libsql-ffi/bundled/sqlean/sqlite3-fileio.c new file mode 100644 index 0000000000..51168f84f2 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-fileio.c @@ -0,0 +1,29 @@ +// Originally by D. Richard Hipp, Public Domain +// https://www.sqlite.org/src/file/ext/misc/fileio.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// Read and write files in SQLite. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "fileio/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_fileio_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return fileio_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-fuzzy.c b/libsql-ffi/bundled/sqlean/sqlite3-fuzzy.c new file mode 100644 index 0000000000..50a834f39c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-fuzzy.c @@ -0,0 +1,26 @@ +// Copyright (c) 2021 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Fuzzy string matching and phonetics. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "fuzzy/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_fuzzy_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return fuzzy_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-ipaddr.c b/libsql-ffi/bundled/sqlean/sqlite3-ipaddr.c new file mode 100644 index 0000000000..234e58043c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-ipaddr.c @@ -0,0 +1,26 @@ +// Copyright (c) 2021 Vincent Bernat, MIT License +// https://github.com/nalgeon/sqlean + +// IP address manipulation in SQLite. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "ipaddr/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_ipaddr_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return ipaddr_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-math.c b/libsql-ffi/bundled/sqlean/sqlite3-math.c new file mode 100644 index 0000000000..2497fca068 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-math.c @@ -0,0 +1,28 @@ +// Originally from SQLite 3.42.0 source code (func.c), Public Domain + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// SQLite math functions. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "math/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_math_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return math_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-regexp.c b/libsql-ffi/bundled/sqlean/sqlite3-regexp.c new file mode 100644 index 0000000000..6c7f0eebb5 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-regexp.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with regular expressions. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "regexp/extension.h" +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_regexp_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return regexp_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-sqlean.c b/libsql-ffi/bundled/sqlean/sqlite3-sqlean.c new file mode 100755 index 0000000000..4ad79de174 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-sqlean.c @@ -0,0 +1,59 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Sqlean extensions bundle. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +// include most of the extensions, +#include "crypto/extension.h" +#include "define/extension.h" +#include "fileio/extension.h" +#include "fuzzy/extension.h" +#if !defined(_WIN32) +#include "ipaddr/extension.h" +#endif +#include "math/extension.h" +#include "regexp/extension.h" +#include "stats/extension.h" +#include "text/extension.h" +#include "time/extension.h" +#include "unicode/extension.h" +#include "uuid/extension.h" +#include "vsv/extension.h" + +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_sqlean_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + crypto_init(db); + define_init(db); + fileio_init(db); + fuzzy_init(db); +#if !defined(_WIN32) + ipaddr_init(db); +#endif + math_init(db); + regexp_init(db); + stats_init(db); + text_init(db); +#if !defined(_WIN32) || defined(_WIN64) + time_init(db); +#endif + unicode_init(db); + uuid_init(db); + vsv_init(db); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-stats.c b/libsql-ffi/bundled/sqlean/sqlite3-stats.c new file mode 100644 index 0000000000..ac5ce80f13 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-stats.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Statistical functions for SQLite. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "stats/extension.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_stats_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return stats_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-text.c b/libsql-ffi/bundled/sqlean/sqlite3-text.c new file mode 100644 index 0000000000..b4b683e828 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-text.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with text. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "text/extension.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_text_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return text_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-time.c b/libsql-ffi/bundled/sqlean/sqlite3-time.c new file mode 100644 index 0000000000..b0e99a3f98 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-time.c @@ -0,0 +1,26 @@ +// Copyright (c) 2024 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with time. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "time/extension.h" + +// sqlean_version returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_time_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return time_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-unicode.c b/libsql-ffi/bundled/sqlean/sqlite3-unicode.c new file mode 100644 index 0000000000..a7bc2b826b --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-unicode.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Unicode support for SQLite. + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "unicode/extension.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_unicode_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return unicode_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-uuid.c b/libsql-ffi/bundled/sqlean/sqlite3-uuid.c new file mode 100644 index 0000000000..c18d0f5f22 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-uuid.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Universally Unique IDentifiers (UUIDs) in SQLite + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "uuid/extension.h" + +// Returns the current Sqlean version. 
+static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_uuid_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return uuid_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/sqlite3-vsv.c b/libsql-ffi/bundled/sqlean/sqlite3-vsv.c new file mode 100755 index 0000000000..b88719dd13 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/sqlite3-vsv.c @@ -0,0 +1,26 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// CSV files as virtual tables in SQLite + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +#include "sqlean.h" +#include "vsv/extension.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_vsv_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + return vsv_init(db); +} diff --git a/libsql-ffi/bundled/sqlean/stats/extension.c b/libsql-ffi/bundled/sqlean/stats/extension.c new file mode 100644 index 0000000000..d974e35190 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/stats/extension.c @@ -0,0 +1,15 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Statistical functions for SQLite. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "stats/stats.h" + +int stats_init(sqlite3* db) { + stats_scalar_init(db); + stats_series_init(db); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/stats/extension.h b/libsql-ffi/bundled/sqlean/stats/extension.h new file mode 100644 index 0000000000..e8f530499d --- /dev/null +++ b/libsql-ffi/bundled/sqlean/stats/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Statistical functions for SQLite. + +#ifndef STATS_EXTENSION_H +#define STATS_EXTENSION_H + +#include "sqlite3ext.h" + +int stats_init(sqlite3* db); + +#endif /* STATS_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/stats/scalar.c b/libsql-ffi/bundled/sqlean/stats/scalar.c new file mode 100644 index 0000000000..47d491340d --- /dev/null +++ b/libsql-ffi/bundled/sqlean/stats/scalar.c @@ -0,0 +1,330 @@ +// Standard deviation and variance by Liam Healy, Public Domain +// extension-functions.c at https://sqlite.org/contrib/ + +// Percentile by D. Richard Hipp, Public Domain +// https://sqlite.org/src/file/ext/misc/percentile.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Statistical functions for SQLite. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#pragma region Standard deviation and variance + +/* +** An instance of the following structure holds the context of a +** stddev() or variance() aggregate computation. 
+** implementaion of http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Algorithm_II +** less prone to rounding errors +*/ +typedef struct StddevCtx StddevCtx; +struct StddevCtx { + double rM; + double rS; + int64_t cnt; /* number of elements */ +}; + +/* +** called for each value received during a calculation of stddev or variance +*/ +static void varianceStep(sqlite3_context* context, int argc, sqlite3_value** argv) { + StddevCtx* p; + + double delta; + double x; + + assert(argc == 1); + p = sqlite3_aggregate_context(context, sizeof(*p)); + /* only consider non-null values */ + if (SQLITE_NULL != sqlite3_value_numeric_type(argv[0])) { + p->cnt++; + x = sqlite3_value_double(argv[0]); + delta = (x - p->rM); + p->rM += delta / p->cnt; + p->rS += delta * (x - p->rM); + } +} + +/* +** Returns the sample standard deviation value +*/ +static void stddevFinalize(sqlite3_context* context) { + StddevCtx* p; + p = sqlite3_aggregate_context(context, 0); + if (p && p->cnt > 1) { + sqlite3_result_double(context, sqrt(p->rS / (p->cnt - 1))); + } else { + sqlite3_result_double(context, 0.0); + } +} + +/* +** Returns the population standard deviation value +*/ +static void stddevpopFinalize(sqlite3_context* context) { + StddevCtx* p; + p = sqlite3_aggregate_context(context, 0); + if (p && p->cnt > 1) { + sqlite3_result_double(context, sqrt(p->rS / p->cnt)); + } else { + sqlite3_result_double(context, 0.0); + } +} + +/* +** Returns the sample variance value +*/ +static void varianceFinalize(sqlite3_context* context) { + StddevCtx* p; + p = sqlite3_aggregate_context(context, 0); + if (p && p->cnt > 1) { + sqlite3_result_double(context, p->rS / (p->cnt - 1)); + } else { + sqlite3_result_double(context, 0.0); + } +} + +/* +** Returns the population variance value +*/ +static void variancepopFinalize(sqlite3_context* context) { + StddevCtx* p; + p = sqlite3_aggregate_context(context, 0); + if (p && p->cnt > 1) { + sqlite3_result_double(context, p->rS / p->cnt); + } else 
{ + sqlite3_result_double(context, 0.0); + } +} + +#pragma endregion + +#pragma region Percentile + +/* The following object is the session context for a single percentile() +** function. We have to remember all input Y values until the very end. +** Those values are accumulated in the Percentile.a[] array. +*/ +typedef struct Percentile Percentile; +struct Percentile { + unsigned nAlloc; /* Number of slots allocated for a[] */ + unsigned nUsed; /* Number of slots actually used in a[] */ + double rPct; /* 1.0 more than the value for P */ + double* a; /* Array of Y values */ +}; + +/* +** Return TRUE if the input floating-point number is an infinity. +*/ +static int isInfinity(double r) { + sqlite3_uint64 u; + assert(sizeof(u) == sizeof(r)); + memcpy(&u, &r, sizeof(u)); + return ((u >> 52) & 0x7ff) == 0x7ff; +} + +/* +** Return TRUE if two doubles differ by 0.001 or less +*/ +static int sameValue(double a, double b) { + a -= b; + return a >= -0.001 && a <= 0.001; +} + +/* +** The "step" function for percentile(Y,P) is called once for each +** input row. +*/ +static void percentStep(sqlite3_context* pCtx, double rPct, int argc, sqlite3_value** argv) { + Percentile* p; + int eType; + double y; + + /* Allocate the session context. */ + p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p)); + if (p == 0) + return; + + /* Remember the P value. Throw an error if the P value is different + ** from any prior row, per Requirement (2). */ + if (p->rPct == 0.0) { + p->rPct = rPct + 1.0; + } else if (!sameValue(p->rPct, rPct + 1.0)) { + sqlite3_result_error(pCtx, + "2nd argument to percentile() is not the " + "same for all input rows", + -1); + return; + } + + /* Ignore rows for which Y is NULL */ + eType = sqlite3_value_type(argv[0]); + if (eType == SQLITE_NULL) + return; + + /* If not NULL, then Y must be numeric. Otherwise throw an error. 
+ ** Requirement 4 */ + if (eType != SQLITE_INTEGER && eType != SQLITE_FLOAT) { + sqlite3_result_error(pCtx, + "1st argument to percentile() is not " + "numeric", + -1); + return; + } + + /* Throw an error if the Y value is infinity or NaN */ + y = sqlite3_value_double(argv[0]); + if (isInfinity(y)) { + sqlite3_result_error(pCtx, "Inf input to percentile()", -1); + return; + } + + /* Allocate and store the Y */ + if (p->nUsed >= p->nAlloc) { + unsigned n = p->nAlloc * 2 + 250; + double* a = sqlite3_realloc64(p->a, sizeof(double) * n); + if (a == 0) { + sqlite3_free(p->a); + memset(p, 0, sizeof(*p)); + sqlite3_result_error_nomem(pCtx); + return; + } + p->nAlloc = n; + p->a = a; + } + p->a[p->nUsed++] = y; +} + +static void percentStepCustom(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 2); + /* Requirement 3: P must be a number between 0 and 100 */ + int eType = sqlite3_value_numeric_type(argv[1]); + double rPct = sqlite3_value_double(argv[1]); + if ((eType != SQLITE_INTEGER && eType != SQLITE_FLOAT) || rPct < 0.0 || rPct > 100.0) { + sqlite3_result_error(pCtx, + "2nd argument to percentile() should be " + "a number between 0.0 and 100.0", + -1); + return; + } + percentStep(pCtx, rPct, argc, argv); +} + +static void percentStep25(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 25, argc, argv); +} + +static void percentStep50(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 50, argc, argv); +} + +static void percentStep75(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 75, argc, argv); +} + +static void percentStep90(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 90, argc, argv); +} + +static void percentStep95(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 95, argc, argv); +} + +static void 
percentStep99(sqlite3_context* pCtx, int argc, sqlite3_value** argv) { + assert(argc == 1); + percentStep(pCtx, 99, argc, argv); +} + +/* +** Compare to doubles for sorting using qsort() +*/ +static int SQLITE_CDECL doubleCmp(const void* pA, const void* pB) { + double a = *(double*)pA; + double b = *(double*)pB; + if (a == b) + return 0; + if (a < b) + return -1; + return +1; +} + +/* +** Called to compute the final output of percentile() and to clean +** up all allocated memory. +*/ +static void percentFinal(sqlite3_context* pCtx) { + Percentile* p; + unsigned i1, i2; + double v1, v2; + double ix, vx; + p = (Percentile*)sqlite3_aggregate_context(pCtx, 0); + if (p == 0) + return; + if (p->a == 0) + return; + if (p->nUsed) { + qsort(p->a, p->nUsed, sizeof(double), doubleCmp); + ix = (p->rPct - 1.0) * (p->nUsed - 1) * 0.01; + i1 = (unsigned)ix; + i2 = ix == (double)i1 || i1 == p->nUsed - 1 ? i1 : i1 + 1; + v1 = p->a[i1]; + v2 = p->a[i2]; + vx = v1 + (v2 - v1) * (ix - i1); + sqlite3_result_double(pCtx, vx); + } + sqlite3_free(p->a); + memset(p, 0, sizeof(*p)); +} + +#pragma endregion + +int stats_scalar_init(sqlite3* db) { + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS; + sqlite3_create_function(db, "stats_stddev", 1, flags, 0, 0, varianceStep, stddevFinalize); + sqlite3_create_function(db, "stats_stddev_samp", 1, flags, 0, 0, varianceStep, stddevFinalize); + sqlite3_create_function(db, "stats_stddev_pop", 1, flags, 0, 0, varianceStep, + stddevpopFinalize); + sqlite3_create_function(db, "stats_var", 1, flags, 0, 0, varianceStep, varianceFinalize); + sqlite3_create_function(db, "stats_var_samp", 1, flags, 0, 0, varianceStep, varianceFinalize); + sqlite3_create_function(db, "stats_var_pop", 1, flags, 0, 0, varianceStep, variancepopFinalize); + sqlite3_create_function(db, "stats_median", 1, flags, 0, 0, percentStep50, percentFinal); + sqlite3_create_function(db, "stats_perc", 2, flags, 0, 0, percentStepCustom, percentFinal); + sqlite3_create_function(db, 
"stats_p25", 1, flags, 0, 0, percentStep25, percentFinal); + sqlite3_create_function(db, "stats_p75", 1, flags, 0, 0, percentStep75, percentFinal); + sqlite3_create_function(db, "stats_p90", 1, flags, 0, 0, percentStep90, percentFinal); + sqlite3_create_function(db, "stats_p95", 1, flags, 0, 0, percentStep95, percentFinal); + sqlite3_create_function(db, "stats_p99", 1, flags, 0, 0, percentStep99, percentFinal); + + sqlite3_create_function(db, "stddev", 1, flags, 0, 0, varianceStep, stddevFinalize); + sqlite3_create_function(db, "stddev_samp", 1, flags, 0, 0, varianceStep, stddevFinalize); + sqlite3_create_function(db, "stddev_pop", 1, flags, 0, 0, varianceStep, stddevpopFinalize); + sqlite3_create_function(db, "variance", 1, flags, 0, 0, varianceStep, varianceFinalize); + sqlite3_create_function(db, "var_samp", 1, flags, 0, 0, varianceStep, varianceFinalize); + sqlite3_create_function(db, "var_pop", 1, flags, 0, 0, varianceStep, variancepopFinalize); + sqlite3_create_function(db, "median", 1, flags, 0, 0, percentStep50, percentFinal); + sqlite3_create_function(db, "percentile", 2, flags, 0, 0, percentStepCustom, percentFinal); + sqlite3_create_function(db, "percentile_25", 1, flags, 0, 0, percentStep25, percentFinal); + sqlite3_create_function(db, "percentile_75", 1, flags, 0, 0, percentStep75, percentFinal); + sqlite3_create_function(db, "percentile_90", 1, flags, 0, 0, percentStep90, percentFinal); + sqlite3_create_function(db, "percentile_95", 1, flags, 0, 0, percentStep95, percentFinal); + sqlite3_create_function(db, "percentile_99", 1, flags, 0, 0, percentStep99, percentFinal); + + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/stats/series.c b/libsql-ffi/bundled/sqlean/stats/series.c new file mode 100644 index 0000000000..6fb8503a1c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/stats/series.c @@ -0,0 +1,372 @@ +// Originally by D. 
Richard Hipp, Public Domain +// https://sqlite.org/src/file/ext/misc/series.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// generate_series function. + +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +/* series_cursor is a subclass of sqlite3_vtab_cursor which will +** serve as the underlying representation of a cursor that scans +** over rows of the result +*/ +typedef struct series_cursor series_cursor; +struct series_cursor { + sqlite3_vtab_cursor base; /* Base class - must be first */ + int isDesc; /* True to count down rather than up */ + sqlite3_int64 iRowid; /* The rowid */ + sqlite3_int64 iValue; /* Current value ("value") */ + sqlite3_int64 mnValue; /* Mimimum value ("start") */ + sqlite3_int64 mxValue; /* Maximum value ("stop") */ + sqlite3_int64 iStep; /* Increment ("step") */ +}; + +/* +** The seriesConnect() method is invoked to create a new +** series_vtab that describes the generate_series virtual table. +** +** Think of this routine as the constructor for series_vtab objects. +** +** All this routine needs to do is: +** +** (1) Allocate the series_vtab object and initialize all fields. +** +** (2) Tell SQLite (via the sqlite3_declare_vtab() interface) what the +** result set of queries against generate_series will look like. 
+*/ +static int seriesConnect(sqlite3* db, + void* pUnused, + int argcUnused, + const char* const* argvUnused, + sqlite3_vtab** ppVtab, + char** pzErrUnused) { + sqlite3_vtab* pNew; + int rc; + +/* Column numbers */ +#define SERIES_COLUMN_VALUE 0 +#define SERIES_COLUMN_START 1 +#define SERIES_COLUMN_STOP 2 +#define SERIES_COLUMN_STEP 3 + + (void)pUnused; + (void)argcUnused; + (void)argvUnused; + (void)pzErrUnused; + rc = sqlite3_declare_vtab(db, "CREATE TABLE x(value,start hidden,stop hidden,step hidden)"); + if (rc == SQLITE_OK) { + pNew = *ppVtab = sqlite3_malloc(sizeof(*pNew)); + if (pNew == 0) + return SQLITE_NOMEM; + memset(pNew, 0, sizeof(*pNew)); + sqlite3_vtab_config(db, SQLITE_VTAB_INNOCUOUS); + } + return rc; +} + +/* +** This method is the destructor for series_cursor objects. +*/ +static int seriesDisconnect(sqlite3_vtab* pVtab) { + sqlite3_free(pVtab); + return SQLITE_OK; +} + +/* +** Constructor for a new series_cursor object. +*/ +static int seriesOpen(sqlite3_vtab* pUnused, sqlite3_vtab_cursor** ppCursor) { + series_cursor* pCur; + (void)pUnused; + pCur = sqlite3_malloc(sizeof(*pCur)); + if (pCur == 0) + return SQLITE_NOMEM; + memset(pCur, 0, sizeof(*pCur)); + *ppCursor = &pCur->base; + return SQLITE_OK; +} + +/* +** Destructor for a series_cursor. +*/ +static int seriesClose(sqlite3_vtab_cursor* cur) { + sqlite3_free(cur); + return SQLITE_OK; +} + +/* +** Advance a series_cursor to its next row of output. +*/ +static int seriesNext(sqlite3_vtab_cursor* cur) { + series_cursor* pCur = (series_cursor*)cur; + if (pCur->isDesc) { + pCur->iValue -= pCur->iStep; + } else { + pCur->iValue += pCur->iStep; + } + pCur->iRowid++; + return SQLITE_OK; +} + +/* +** Return values of columns for the row at which the series_cursor +** is currently pointing. 
+*/ +static int seriesColumn(sqlite3_vtab_cursor* cur, /* The cursor */ + sqlite3_context* ctx, /* First argument to sqlite3_result_...() */ + int i /* Which column to return */ +) { + series_cursor* pCur = (series_cursor*)cur; + sqlite3_int64 x = 0; + switch (i) { + case SERIES_COLUMN_START: + x = pCur->mnValue; + break; + case SERIES_COLUMN_STOP: + x = pCur->mxValue; + break; + case SERIES_COLUMN_STEP: + x = pCur->iStep; + break; + default: + x = pCur->iValue; + break; + } + sqlite3_result_int64(ctx, x); + return SQLITE_OK; +} + +/* +** Return the rowid for the current row. In this implementation, the +** first row returned is assigned rowid value 1, and each subsequent +** row a value 1 more than that of the previous. +*/ +static int seriesRowid(sqlite3_vtab_cursor* cur, sqlite_int64* pRowid) { + series_cursor* pCur = (series_cursor*)cur; + *pRowid = pCur->iRowid; + return SQLITE_OK; +} + +/* +** Return TRUE if the cursor has been moved off of the last +** row of output. +*/ +static int seriesEof(sqlite3_vtab_cursor* cur) { + series_cursor* pCur = (series_cursor*)cur; + if (pCur->isDesc) { + return pCur->iValue < pCur->mnValue; + } else { + return pCur->iValue > pCur->mxValue; + } +} + +/* True to cause run-time checking of the start=, stop=, and/or step= +** parameters. The only reason to do this is for testing the +** constraint checking logic for virtual tables in the SQLite core. +*/ +#ifndef SQLITE_SERIES_CONSTRAINT_VERIFY +#define SQLITE_SERIES_CONSTRAINT_VERIFY 0 +#endif + +/* +** This method is called to "rewind" the series_cursor object back +** to the first row of output. This method is always called at least +** once prior to any call to seriesColumn() or seriesRowid() or +** seriesEof(). +** +** The query plan selected by seriesBestIndex is passed in the idxNum +** parameter. (idxStr is not used in this implementation.) 
idxNum +** is a bitmask showing which constraints are available: +** +** 1: start=VALUE +** 2: stop=VALUE +** 4: step=VALUE +** +** Also, if bit 8 is set, that means that the series should be output +** in descending order rather than in ascending order. If bit 16 is +** set, then output must appear in ascending order. +** +** This routine should initialize the cursor and position it so that it +** is pointing at the first row, or pointing off the end of the table +** (so that seriesEof() will return true) if the table is empty. +*/ +static int seriesFilter(sqlite3_vtab_cursor* pVtabCursor, + int idxNum, + const char* idxStrUnused, + int argc, + sqlite3_value** argv) { + series_cursor* pCur = (series_cursor*)pVtabCursor; + int i = 0; + (void)idxStrUnused; + if (idxNum & 1) { + pCur->mnValue = sqlite3_value_int64(argv[i++]); + } else { + pCur->mnValue = 0; + } + if (idxNum & 2) { + pCur->mxValue = sqlite3_value_int64(argv[i++]); + } else { + pCur->mxValue = 0xffffffff; + } + if (idxNum & 4) { + pCur->iStep = sqlite3_value_int64(argv[i++]); + if (pCur->iStep == 0) { + pCur->iStep = 1; + } else if (pCur->iStep < 0) { + pCur->iStep = -pCur->iStep; + if ((idxNum & 16) == 0) + idxNum |= 8; + } + } else { + pCur->iStep = 1; + } + for (i = 0; i < argc; i++) { + if (sqlite3_value_type(argv[i]) == SQLITE_NULL) { + /* If any of the constraints have a NULL value, then return no rows. + ** See ticket https://www.sqlite.org/src/info/fac496b61722daf2 */ + pCur->mnValue = 1; + pCur->mxValue = 0; + break; + } + } + if (idxNum & 8) { + pCur->isDesc = 1; + pCur->iValue = pCur->mxValue; + if (pCur->iStep > 0) { + pCur->iValue -= (pCur->mxValue - pCur->mnValue) % pCur->iStep; + } + } else { + pCur->isDesc = 0; + pCur->iValue = pCur->mnValue; + } + pCur->iRowid = 1; + return SQLITE_OK; +} + +/* +** SQLite will invoke this method one or more times while planning a query +** that uses the generate_series virtual table. 
This routine needs to create +** a query plan for each invocation and compute an estimated cost for that +** plan. +** +** In this implementation idxNum is used to represent the +** query plan. idxStr is unused. +** +** The query plan is represented by bits in idxNum: +** +** (1) start = $value -- constraint exists +** (2) stop = $value -- constraint exists +** (4) step = $value -- constraint exists +** (8) output in descending order +*/ +static int seriesBestIndex(sqlite3_vtab* pVTab, sqlite3_index_info* pIdxInfo) { + int i, j; /* Loop over constraints */ + int idxNum = 0; /* The query plan bitmask */ + int bStartSeen = 0; /* EQ constraint seen on the START column */ + int unusableMask = 0; /* Mask of unusable constraints */ + int nArg = 0; /* Number of arguments that seriesFilter() expects */ + int aIdx[3]; /* Constraints on start, stop, and step */ + const struct sqlite3_index_constraint* pConstraint; + + /* This implementation assumes that the start, stop, and step columns + ** are the last three columns in the virtual table. 
*/ + assert(SERIES_COLUMN_STOP == SERIES_COLUMN_START + 1); + assert(SERIES_COLUMN_STEP == SERIES_COLUMN_START + 2); + + aIdx[0] = aIdx[1] = aIdx[2] = -1; + pConstraint = pIdxInfo->aConstraint; + for (i = 0; i < pIdxInfo->nConstraint; i++, pConstraint++) { + int iCol; /* 0 for start, 1 for stop, 2 for step */ + int iMask; /* bitmask for those column */ + if (pConstraint->iColumn < SERIES_COLUMN_START) + continue; + iCol = pConstraint->iColumn - SERIES_COLUMN_START; + assert(iCol >= 0 && iCol <= 2); + iMask = 1 << iCol; + if (iCol == 0) + bStartSeen = 1; + if (pConstraint->usable == 0) { + unusableMask |= iMask; + continue; + } else if (pConstraint->op == SQLITE_INDEX_CONSTRAINT_EQ) { + idxNum |= iMask; + aIdx[iCol] = i; + } + } + for (i = 0; i < 3; i++) { + if ((j = aIdx[i]) >= 0) { + pIdxInfo->aConstraintUsage[j].argvIndex = ++nArg; + pIdxInfo->aConstraintUsage[j].omit = !SQLITE_SERIES_CONSTRAINT_VERIFY; + } + } + /* The current generate_column() implementation requires at least one + ** argument (the START value). Legacy versions assumed START=0 if the + ** first argument was omitted. Compile with -DZERO_ARGUMENT_GENERATE_SERIES + ** to obtain the legacy behavior */ +#ifndef ZERO_ARGUMENT_GENERATE_SERIES + if (!bStartSeen) { + sqlite3_free(pVTab->zErrMsg); + pVTab->zErrMsg = + sqlite3_mprintf("first argument to \"generate_series()\" missing or unusable"); + return SQLITE_ERROR; + } +#endif + if ((unusableMask & ~idxNum) != 0) { + /* The start, stop, and step columns are inputs. Therefore if there + ** are unusable constraints on any of start, stop, or step then + ** this plan is unusable */ + return SQLITE_CONSTRAINT; + } + if ((idxNum & 3) == 3) { + /* Both start= and stop= boundaries are available. 
This is the + ** the preferred case */ + pIdxInfo->estimatedCost = (double)(2 - ((idxNum & 4) != 0)); + pIdxInfo->estimatedRows = 1000; + if (pIdxInfo->nOrderBy == 1) { + if (pIdxInfo->aOrderBy[0].desc) { + idxNum |= 8; + } else { + idxNum |= 16; + } + pIdxInfo->orderByConsumed = 1; + } + } else { + /* If either boundary is missing, we have to generate a huge span + ** of numbers. Make this case very expensive so that the query + ** planner will work hard to avoid it. */ + pIdxInfo->estimatedRows = 2147483647; + } + pIdxInfo->idxNum = idxNum; + return SQLITE_OK; +} + +/* +** This following structure defines all the methods for the +** generate_series virtual table. +*/ +static sqlite3_module series_module = { + .xConnect = seriesConnect, + .xBestIndex = seriesBestIndex, + .xDisconnect = seriesDisconnect, + .xOpen = seriesOpen, + .xClose = seriesClose, + .xFilter = seriesFilter, + .xNext = seriesNext, + .xEof = seriesEof, + .xColumn = seriesColumn, + .xRowid = seriesRowid, +}; + +int stats_series_init(sqlite3* db) { + sqlite3_create_module(db, "stats_seq", &series_module, 0); + sqlite3_create_module(db, "generate_series", &series_module, 0); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/stats/stats.h b/libsql-ffi/bundled/sqlean/stats/stats.h new file mode 100644 index 0000000000..dd7e6826cb --- /dev/null +++ b/libsql-ffi/bundled/sqlean/stats/stats.h @@ -0,0 +1,14 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Statistical functions for SQLite. 
+ +#ifndef STATS_INTERNAL_H +#define STATS_INTERNAL_H + +#include "sqlite3ext.h" + +int stats_scalar_init(sqlite3* db); +int stats_series_init(sqlite3* db); + +#endif /* STATS_INTERNAL_H */ diff --git a/libsql-ffi/bundled/sqlean/text/bstring.c b/libsql-ffi/bundled/sqlean/text/bstring.c new file mode 100644 index 0000000000..1bada067bc --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/bstring.c @@ -0,0 +1,488 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Byte string data structure. + +#include +#include +#include +#include +#include +#include +#include + +#include "text/bstring.h" + +// bstring_new creates an empty string. +ByteString bstring_new(void) { + char* bytes = "\0"; + ByteString str = {.bytes = bytes, .length = 0, .owning = false}; + return str; +} + +// bstring_from_cstring creates a new string that wraps an existing C string. +ByteString bstring_from_cstring(const char* const cstring, size_t length) { + ByteString str = {.bytes = cstring, .length = length, .owning = false}; + return str; +} + +// bstring_clone creates a new string by copying an existing C string. +static ByteString bstring_clone(const char* const cstring, size_t length) { + char* bytes = calloc(length + 1, sizeof(char)); + if (bytes == NULL) { + ByteString str = {NULL, 0, true}; + return str; + } + memcpy(bytes, cstring, length); + ByteString str = {bytes, length, true}; + return str; +} + +// bstring_to_cstring converts the string to a zero-terminated C string. +const char* bstring_to_cstring(ByteString str) { + if (str.bytes == NULL) { + return NULL; + } + return str.bytes; +} + +// bstring_free destroys the string, freeing resources if necessary. +void bstring_free(ByteString str) { + if (str.owning && str.bytes != NULL) { + free((void*)str.bytes); + } +} + +// bstring_at returns a character by its index in the string. 
+char bstring_at(ByteString str, size_t idx) { + if (str.length == 0) { + return 0; + } + if (idx < 0 || idx >= str.length) { + return 0; + }; + return str.bytes[idx]; +} + +// bstring_slice returns a slice of the string, +// from the `start` index (inclusive) to the `end` index (non-inclusive). +// Negative `start` and `end` values count from the end of the string. +ByteString bstring_slice(ByteString str, int start, int end) { + if (str.length == 0) { + return bstring_new(); + } + + // adjusted start index + start = start < 0 ? (int)str.length + start : start; + // python-compatible: treat negative start index larger than the length of the string as zero + start = start < 0 ? 0 : start; + // adjusted start index should be less the the length of the string + if (start >= (int)str.length) { + return bstring_new(); + } + + // adjusted end index + end = end < 0 ? (int)str.length + end : end; + // python-compatible: treat end index larger than the length of the string + // as equal to the length + end = end > (int)str.length ? (int)str.length : end; + // adjusted end index should be >= 0 + if (end < 0) { + return bstring_new(); + } + + // adjusted start index should be less than adjusted end index + if (start >= end) { + return bstring_new(); + } + + char* at = (char*)str.bytes + start; + size_t length = end - start; + ByteString slice = bstring_clone(at, length); + return slice; +} + +// bstring_substring returns a substring of `length` characters, +// starting from the `start` index. +ByteString bstring_substring(ByteString str, size_t start, size_t length) { + if (length > str.length - start) { + length = str.length - start; + } + return bstring_slice(str, start, start + length); +} + +// bstring_contains_after checks if the other string is a substring of the original string, +// starting at the `start` index. 
+static bool bstring_contains_after(ByteString str, ByteString other, size_t start) { + if (start + other.length > str.length) { + return false; + } + for (size_t idx = 0; idx < other.length; idx++) { + if (str.bytes[start + idx] != other.bytes[idx]) { + return false; + } + } + return true; +} + +// bstring_index_char returns the first index of the character in the string +// after the `start` index, inclusive. +static int bstring_index_char(ByteString str, char chr, size_t start) { + for (size_t idx = start; idx < str.length; idx++) { + if (str.bytes[idx] == chr) { + return idx; + } + } + return -1; +} + +// bstring_last_index_char returns the last index of the character in the string +// before the `end` index, inclusive. +static int bstring_last_index_char(ByteString str, char chr, size_t end) { + if (end >= str.length) { + return -1; + } + for (int idx = end; idx >= 0; idx--) { + if (str.bytes[idx] == chr) { + return idx; + } + } + return -1; +} + +// bstring_index_after returns the index of the substring in the original string +// after the `start` index, inclusive. +static int bstring_index_after(ByteString str, ByteString other, size_t start) { + if (other.length == 0) { + return start; + } + if (str.length == 0 || other.length > str.length) { + return -1; + } + + size_t cur_idx = start; + while (cur_idx < str.length) { + int match_idx = bstring_index_char(str, other.bytes[0], cur_idx); + if (match_idx == -1) { + return match_idx; + } + if (bstring_contains_after(str, other, match_idx)) { + return match_idx; + } + cur_idx = match_idx + 1; + } + return -1; +} + +// bstring_index returns the first index of the substring in the original string. +int bstring_index(ByteString str, ByteString other) { + return bstring_index_after(str, other, 0); +} + +// bstring_last_index returns the last index of the substring in the original string. 
+int bstring_last_index(ByteString str, ByteString other) { + if (other.length == 0) { + return str.length - 1; + } + if (str.length == 0 || other.length > str.length) { + return -1; + } + + int cur_idx = str.length - 1; + while (cur_idx >= 0) { + int match_idx = bstring_last_index_char(str, other.bytes[0], cur_idx); + if (match_idx == -1) { + return match_idx; + } + if (bstring_contains_after(str, other, match_idx)) { + return match_idx; + } + cur_idx = match_idx - 1; + } + + return -1; +} + +// bstring_contains checks if the string contains the substring. +bool bstring_contains(ByteString str, ByteString other) { + return bstring_index(str, other) != -1; +} + +// bstring_equals checks if two strings are equal character by character. +bool bstring_equals(ByteString str, ByteString other) { + if (str.bytes == NULL && other.bytes == NULL) { + return true; + } + if (str.bytes == NULL || other.bytes == NULL) { + return false; + } + if (str.length != other.length) { + return false; + } + return bstring_contains_after(str, other, 0); +} + +// bstring_has_prefix checks if the string starts with the `other` substring. +bool bstring_has_prefix(ByteString str, ByteString other) { + return bstring_index(str, other) == 0; +} + +// bstring_has_suffix checks if the string ends with the `other` substring. +bool bstring_has_suffix(ByteString str, ByteString other) { + if (other.length == 0) { + return true; + } + int idx = bstring_last_index(str, other); + return idx < 0 ? false : (size_t)idx == (str.length - other.length); +} + +// bstring_count counts how many times the `other` substring is contained in the original string. 
+size_t bstring_count(ByteString str, ByteString other) { + if (str.length == 0 || other.length == 0 || other.length > str.length) { + return 0; + } + + size_t count = 0; + size_t char_idx = 0; + while (char_idx < str.length) { + int match_idx = bstring_index_after(str, other, char_idx); + if (match_idx == -1) { + break; + } + count += 1; + char_idx = match_idx + other.length; + } + + return count; +} + +// bstring_split_part splits the string by the separator and returns the nth part (0-based). +ByteString bstring_split_part(ByteString str, ByteString sep, size_t part) { + if (str.length == 0 || sep.length > str.length) { + return bstring_new(); + } + if (sep.length == 0) { + if (part == 0) { + return bstring_slice(str, 0, str.length); + } else { + return bstring_new(); + } + } + + size_t found = 0; + size_t prev_idx = 0; + size_t char_idx = 0; + while (char_idx < str.length) { + int match_idx = bstring_index_after(str, sep, char_idx); + if (match_idx == -1) { + break; + } + if (found == part) { + return bstring_slice(str, prev_idx, match_idx); + } + found += 1; + prev_idx = match_idx + sep.length; + char_idx = match_idx + sep.length; + } + + if (found == part) { + return bstring_slice(str, prev_idx, str.length); + } + + return bstring_new(); +} + +// bstring_join joins strings using the separator and returns the resulting string. 
+ByteString bstring_join(ByteString* strings, size_t count, ByteString sep) { + // calculate total string length + size_t total_length = 0; + for (size_t idx = 0; idx < count; idx++) { + ByteString str = strings[idx]; + total_length += str.length; + // no separator after the last one + if (idx != count - 1) { + total_length += sep.length; + } + } + + // allocate memory for the bytes + size_t total_size = total_length * sizeof(char); + char* bytes = malloc(total_size + 1); + if (bytes == NULL) { + ByteString str = {NULL, 0, false}; + return str; + } + + // copy bytes from each string with separator in between + char* at = bytes; + for (size_t idx = 0; idx < count; idx++) { + ByteString str = strings[idx]; + memcpy(at, str.bytes, str.length); + at += str.length; + if (idx != count - 1 && sep.length != 0) { + memcpy(at, sep.bytes, sep.length); + at += sep.length; + } + } + + bytes[total_length] = '\0'; + ByteString str = {bytes, total_length, true}; + return str; +} + +// bstring_concat concatenates strings and returns the resulting string. +ByteString bstring_concat(ByteString* strings, size_t count) { + ByteString sep = bstring_new(); + return bstring_join(strings, count, sep); +} + +// bstring_repeat concatenates the string to itself a given number of times +// and returns the resulting string. 
+ByteString bstring_repeat(ByteString str, size_t count) { + // calculate total string length + size_t total_length = str.length * count; + + // allocate memory for the bytes + size_t total_size = total_length * sizeof(char); + char* bytes = malloc(total_size + 1); + if (bytes == NULL) { + ByteString res = {NULL, 0, false}; + return res; + } + + // copy bytes + char* at = bytes; + for (size_t idx = 0; idx < count; idx++) { + memcpy(at, str.bytes, str.length); + at += str.length; + } + + bytes[total_size] = '\0'; + ByteString res = {bytes, total_length, true}; + return res; +} + +// bstring_replace replaces the `old` substring with the `new` substring in the original string, +// but not more than `max_count` times. +ByteString bstring_replace(ByteString str, ByteString old, ByteString new, size_t max_count) { + // count matches of the old string in the source string + size_t count = bstring_count(str, old); + if (count == 0) { + return bstring_slice(str, 0, str.length); + } + + // limit the number of replacements + if (max_count >= 0 && count > max_count) { + count = max_count; + } + + // k matches split string into (k+1) parts + // allocate an array for them + size_t parts_count = count + 1; + ByteString* strings = malloc(parts_count * sizeof(ByteString)); + if (strings == NULL) { + ByteString res = {NULL, 0, false}; + return res; + } + + // split the source string where it matches the old string + // and fill the strings array with these parts + size_t part_idx = 0; + size_t char_idx = 0; + while (char_idx < str.length && part_idx < count) { + int match_idx = bstring_index_after(str, old, char_idx); + if (match_idx == -1) { + break; + } + // slice from the prevoius match to the current match + strings[part_idx] = bstring_slice(str, char_idx, match_idx); + part_idx += 1; + char_idx = match_idx + old.length; + } + // "tail" from the last match to the end of the source string + strings[part_idx] = bstring_slice(str, char_idx, str.length); + + // join all the parts 
using new string as a separator + ByteString res = bstring_join(strings, parts_count, new); + // free string parts + for (size_t idx = 0; idx < parts_count; idx++) { + bstring_free(strings[idx]); + } + free(strings); + return res; +} + +// bstring_replace_all replaces all `old` substrings with the `new` substrings +// in the original string. +ByteString bstring_replace_all(ByteString str, ByteString old, ByteString new) { + return bstring_replace(str, old, new, -1); +} + +// bstring_reverse returns the reversed string. +ByteString bstring_reverse(ByteString str) { + ByteString res = bstring_clone(str.bytes, str.length); + char* bytes = (char*)res.bytes; + for (size_t i = 0; i < str.length / 2; i++) { + char r = bytes[i]; + bytes[i] = bytes[str.length - 1 - i]; + bytes[str.length - 1 - i] = r; + } + return res; +} + +// bstring_trim_left trims whitespaces from the beginning of the string. +ByteString bstring_trim_left(ByteString str) { + if (str.length == 0) { + return bstring_new(); + } + size_t idx = 0; + for (; idx < str.length; idx++) { + if (!isspace(str.bytes[idx])) { + break; + } + } + return bstring_slice(str, idx, str.length); +} + +// bstring_trim_right trims whitespaces from the end of the string. +ByteString bstring_trim_right(ByteString str) { + if (str.length == 0) { + return bstring_new(); + } + size_t idx = str.length - 1; + for (; idx >= 0; idx--) { + if (!isspace(str.bytes[idx])) { + break; + } + } + return bstring_slice(str, 0, idx + 1); +} + +// bstring_trim trims whitespaces from the beginning and end of the string. +ByteString bstring_trim(ByteString str) { + if (str.length == 0) { + return bstring_new(); + } + size_t left = 0; + for (; left < str.length; left++) { + if (!isspace(str.bytes[left])) { + break; + } + } + size_t right = str.length - 1; + for (; right >= 0; right--) { + if (!isspace(str.bytes[right])) { + break; + } + } + return bstring_slice(str, left, right + 1); +} + +// bstring_print prints the string to stdout. 
+void bstring_print(ByteString str) { + if (str.bytes == NULL) { + printf("\n"); + return; + } + printf("'%s' (len=%zu)\n", str.bytes, str.length); +} diff --git a/libsql-ffi/bundled/sqlean/text/bstring.h b/libsql-ffi/bundled/sqlean/text/bstring.h new file mode 100644 index 0000000000..40b681291d --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/bstring.h @@ -0,0 +1,57 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Byte string data structure. + +#ifndef BSTRING_H +#define BSTRING_H + +#include +#include +#include + +// ByteString is a string composed of bytes. +typedef struct { + // array of bytes + const char* bytes; + // number of bytes in the string + size_t length; + // indicates whether the string owns the array + // and should free the memory when destroyed + bool owning; +} ByteString; + +// ByteString methods. +ByteString bstring_new(void); +ByteString bstring_from_cstring(const char* const cstring, size_t length); +const char* bstring_to_cstring(ByteString str); +void bstring_free(ByteString str); + +char bstring_at(ByteString str, size_t idx); +ByteString bstring_slice(ByteString str, int start, int end); +ByteString bstring_substring(ByteString str, size_t start, size_t length); + +int bstring_index(ByteString str, ByteString other); +int bstring_last_index(ByteString str, ByteString other); +bool bstring_contains(ByteString str, ByteString other); +bool bstring_equals(ByteString str, ByteString other); +bool bstring_has_prefix(ByteString str, ByteString other); +bool bstring_has_suffix(ByteString str, ByteString other); +size_t bstring_count(ByteString str, ByteString other); + +ByteString bstring_split_part(ByteString str, ByteString sep, size_t part); +ByteString bstring_join(ByteString* strings, size_t count, ByteString sep); +ByteString bstring_concat(ByteString* strings, size_t count); +ByteString bstring_repeat(ByteString str, size_t count); + +ByteString bstring_replace(ByteString str, 
ByteString old, ByteString new, size_t max_count); +ByteString bstring_replace_all(ByteString str, ByteString old, ByteString new); +ByteString bstring_reverse(ByteString str); + +ByteString bstring_trim_left(ByteString str); +ByteString bstring_trim_right(ByteString str); +ByteString bstring_trim(ByteString str); + +void bstring_print(ByteString str); + +#endif /* BSTRING_H */ diff --git a/libsql-ffi/bundled/sqlean/text/extension.c b/libsql-ffi/bundled/sqlean/text/extension.c new file mode 100644 index 0000000000..c7dbd3d920 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/extension.c @@ -0,0 +1,987 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with text. + +#include +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "text/bstring.h" +#include "text/rstring.h" +#include "text/utf8/utf8.h" + +#pragma region Substrings + +// Extracts a substring starting at the `start` position (1-based). +// text_substring(str, start) +// [pg-compatible] substr(string, start) +static void text_substring2(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "start parameter should be integer", -1); + return; + } + int start = sqlite3_value_int(argv[1]); + + // convert to 0-based index + // postgres-compatible: treat negative index as zero + start = start > 0 ? start - 1 : 0; + + RuneString s_src = rstring_from_cstring(src); + RuneString s_res = rstring_slice(s_src, start, s_src.length); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +// Extracts a substring of `length` characters starting at the `start` position (1-based). 
+// text_substring(str, start, length) +// [pg-compatible] substr(string, start, count) +static void text_substring3(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 3); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "start parameter should be integer", -1); + return; + } + int start = sqlite3_value_int(argv[1]); + + if (sqlite3_value_type(argv[2]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "length parameter should be integer", -1); + return; + } + int length = sqlite3_value_int(argv[2]); + if (length < 0) { + sqlite3_result_error(context, "length parameter should >= 0", -1); + return; + } + + // convert to 0-based index + start -= 1; + // postgres-compatible: treat negative start as 0, but shorten the length accordingly + if (start < 0) { + length += start; + start = 0; + } + + // zero-length substring + if (length <= 0) { + sqlite3_result_text(context, "", -1, SQLITE_TRANSIENT); + return; + } + + RuneString s_src = rstring_from_cstring(src); + + // postgres-compatible: the substring cannot be longer the the original string + if ((size_t)length > s_src.length) { + length = s_src.length; + } + + RuneString s_res = rstring_substring(s_src, start, length); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +// Extracts a substring starting at the `start` position (1-based). 
+// text_slice(str, start) +static void text_slice2(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "start parameter should be integer", -1); + return; + } + int start = sqlite3_value_int(argv[1]); + + // convert to 0-based index + start = start > 0 ? start - 1 : start; + + RuneString s_src = rstring_from_cstring(src); + + // python-compatible: treat negative index larger than the length of the string as zero + // and return the original string + if (start < -(int)s_src.length) { + sqlite3_result_text(context, src, -1, SQLITE_TRANSIENT); + rstring_free(s_src); + return; + } + + RuneString s_res = rstring_slice(s_src, start, s_src.length); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +// Extracts a substring from `start` position inclusive to `end` position non-inclusive (1-based). +// text_slice(str, start, end) +static void text_slice3(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 3); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "start parameter should be integer", -1); + return; + } + int start = sqlite3_value_int(argv[1]); + // convert to 0-based index + start = start > 0 ? start - 1 : start; + + if (sqlite3_value_type(argv[2]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "end parameter should be integer", -1); + return; + } + int end = sqlite3_value_int(argv[2]); + // convert to 0-based index + end = end > 0 ? 
end - 1 : end; + + RuneString s_src = rstring_from_cstring(src); + RuneString s_res = rstring_slice(s_src, start, end); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +// Extracts a substring of `length` characters from the beginning of the string. +// For `length < 0`, extracts all but the last `|length|` characters. +// text_left(str, length) +// [pg-compatible] left(string, n) +static void text_left(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "length parameter should be integer", -1); + return; + } + int length = sqlite3_value_int(argv[1]); + + RuneString s_src = rstring_from_cstring(src); + if (length < 0) { + length = s_src.length + length; + length = length >= 0 ? length : 0; + } + RuneString s_res = rstring_substring(s_src, 0, length); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +// Extracts a substring of `length` characters from the end of the string. +// For `length < 0`, extracts all but the first `|length|` characters. +// text_right(str, length) +// [pg-compatible] right(string, n) +static void text_right(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "length parameter should be integer", -1); + return; + } + int length = sqlite3_value_int(argv[1]); + + RuneString s_src = rstring_from_cstring(src); + + length = (length < 0) ? 
(int)s_src.length + length : length; + int start = (int)s_src.length - length; + start = start < 0 ? 0 : start; + + RuneString s_res = rstring_substring(s_src, start, length); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +#pragma endregion + +#pragma region Search and match + +// Returns the first index of the substring in the original string. +// text_index(str, other) +// [pg-compatible] strpos(string, substring) +static void text_index(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_src = rstring_from_cstring(src); + RuneString s_other = rstring_from_cstring(other); + int idx = rstring_index(s_src, s_other); + sqlite3_result_int64(context, idx + 1); + rstring_free(s_src); + rstring_free(s_other); +} + +// Returns the last index of the substring in the original string. 
+// text_last_index(str, other) +static void text_last_index(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_src = rstring_from_cstring(src); + RuneString s_other = rstring_from_cstring(other); + int idx = rstring_last_index(s_src, s_other); + sqlite3_result_int64(context, idx + 1); + rstring_free(s_src); + rstring_free(s_other); +} + +// Checks if the string contains the substring_ +// text_contains(str, other) +static void text_contains(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_other = bstring_from_cstring(other, sqlite3_value_bytes(argv[1])); + bool found = bstring_contains(s_src, s_other); + sqlite3_result_int(context, found); + bstring_free(s_src); + bstring_free(s_other); +} + +// Checks if the string starts with the substring_ +// text_has_prefix(str, other) +// [pg-compatible] starts_with(string, prefix) +static void text_has_prefix(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_other = 
bstring_from_cstring(other, sqlite3_value_bytes(argv[1])); + bool found = bstring_has_prefix(s_src, s_other); + sqlite3_result_int(context, found); + bstring_free(s_src); + bstring_free(s_other); +} + +// Checks if the string ends with the substring_ +// text_has_suffix(str, other) +static void text_has_suffix(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_other = bstring_from_cstring(other, sqlite3_value_bytes(argv[1])); + bool found = bstring_has_suffix(s_src, s_other); + sqlite3_result_int(context, found); + bstring_free(s_src); + bstring_free(s_other); +} + +// Counts how many times the substring is contained in the original string. +// text_count(str, other) +static void text_count(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* other = (char*)sqlite3_value_text(argv[1]); + if (other == NULL) { + sqlite3_result_null(context); + return; + } + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_other = bstring_from_cstring(other, sqlite3_value_bytes(argv[1])); + size_t count = bstring_count(s_src, s_other); + sqlite3_result_int(context, count); + bstring_free(s_src); + bstring_free(s_other); +} + +// Checks if the string matches the pattern using the SQL LIKE syntax. 
+// text_like(pattern, str) +// like(pattern, str) +// str LIKE pattern +static void text_like(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* pattern = (char*)sqlite3_value_text(argv[0]); + if (pattern == NULL) { + sqlite3_result_null(context); + return; + } + + const char* str = (char*)sqlite3_value_text(argv[1]); + if (str == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_pattern = rstring_from_cstring(pattern); + RuneString s_str = rstring_from_cstring(str); + bool match = rstring_like(s_pattern, s_str); + sqlite3_result_int(context, match); + rstring_free(s_pattern); + rstring_free(s_str); +} + +#pragma endregion + +#pragma region Split and join + +// Splits a string by a separator and returns the n-th part (counting from one). +// When n is negative, returns the |n|'th-from-last part. +// text_split(str, sep, n) +// [pg-compatible] split_part(string, delimiter, n) +static void text_split(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 3); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* sep = (const char*)sqlite3_value_text(argv[1]); + if (sep == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[2]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "part parameter should be integer", -1); + return; + } + int part = sqlite3_value_int(argv[2]); + // pg-compatible + if (part == 0) { + sqlite3_result_error(context, "part parameter should not be 0", -1); + return; + } + // convert to 0-based index + part = part > 0 ? 
part - 1 : part; + + ByteString s_src = bstring_from_cstring(src, strlen(src)); + ByteString s_sep = bstring_from_cstring(sep, strlen(sep)); + + // count from the last part backwards + if (part < 0) { + int n_parts = bstring_count(s_src, s_sep) + 1; + part = n_parts + part; + } + + ByteString s_part = bstring_split_part(s_src, s_sep, part); + sqlite3_result_text(context, s_part.bytes, -1, SQLITE_TRANSIENT); + bstring_free(s_src); + bstring_free(s_sep); + bstring_free(s_part); +} + +// Joins strings using the separator and returns the resulting string. Ignores nulls. +// text_join(sep, str, ...) +// [pg-compatible] concat_ws(sep, val1[, val2 [, ...]]) +static void text_join(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc < 2) { + sqlite3_result_error(context, "expected at least 2 parameters", -1); + return; + } + + // separator + const char* sep = (char*)sqlite3_value_text(argv[0]); + if (sep == NULL) { + sqlite3_result_null(context); + return; + } + ByteString s_sep = bstring_from_cstring(sep, sqlite3_value_bytes(argv[0])); + + // parts + size_t n_parts = argc - 1; + ByteString* s_parts = malloc(n_parts * sizeof(ByteString)); + if (s_parts == NULL) { + sqlite3_result_null(context); + return; + } + for (size_t i = 1, part_idx = 0; i < (size_t)argc; i++) { + if (sqlite3_value_type(argv[i]) == SQLITE_NULL) { + // ignore nulls + n_parts--; + continue; + } + const char* part = (char*)sqlite3_value_text(argv[i]); + int part_len = sqlite3_value_bytes(argv[i]); + s_parts[part_idx] = bstring_from_cstring(part, part_len); + part_idx++; + } + + // join parts with separator + ByteString s_res = bstring_join(s_parts, n_parts, s_sep); + const char* res = bstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, SQLITE_TRANSIENT); + bstring_free(s_sep); + bstring_free(s_res); + free(s_parts); +} + +// Concatenates strings and returns the resulting string. Ignores nulls. +// text_concat(str, ...) 
+// [pg-compatible] concat(val1[, val2 [, ...]]) +static void text_concat(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc < 1) { + sqlite3_result_error(context, "expected at least 1 parameter", -1); + return; + } + + // parts + size_t n_parts = argc; + ByteString* s_parts = malloc(n_parts * sizeof(ByteString)); + if (s_parts == NULL) { + sqlite3_result_null(context); + return; + } + for (size_t i = 0, part_idx = 0; i < (size_t)argc; i++) { + if (sqlite3_value_type(argv[i]) == SQLITE_NULL) { + // ignore nulls + n_parts--; + continue; + } + const char* part = (char*)sqlite3_value_text(argv[i]); + int part_len = sqlite3_value_bytes(argv[i]); + s_parts[part_idx] = bstring_from_cstring(part, part_len); + part_idx++; + } + + // join parts + ByteString s_res = bstring_concat(s_parts, n_parts); + const char* res = bstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, SQLITE_TRANSIENT); + bstring_free(s_res); + free(s_parts); +} + +// Concatenates the string to itself a given number of times and returns the resulting string. +// text_repeat(str, count) +// [pg-compatible] repeat(string, number) +static void text_repeat(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 2); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "count parameter should be integer", -1); + return; + } + int count = sqlite3_value_int(argv[1]); + // pg-compatible: treat negative count as zero + count = count >= 0 ? 
count : 0; + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_res = bstring_repeat(s_src, count); + const char* res = bstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, SQLITE_TRANSIENT); + bstring_free(s_src); + bstring_free(s_res); +} + +#pragma endregion + +#pragma region Trim and pad + +// Trims certain characters (spaces by default) from the beginning/end of the string. +// text_ltrim(str [,chars]) +// text_rtrim(str [,chars]) +// text_trim(str [,chars]) +// [pg-compatible] ltrim(string [, characters]) +// [pg-compatible] rtrim(string [, characters]) +// [pg-compatible] btrim(string [, characters]) +static void text_trim(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc != 1 && argc != 2) { + sqlite3_result_error(context, "expected 1 or 2 parameters", -1); + return; + } + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* chars = argc == 2 ? (char*)sqlite3_value_text(argv[1]) : " "; + if (chars == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString (*trim_func)(RuneString, RuneString) = (void*)sqlite3_user_data(context); + + RuneString s_src = rstring_from_cstring(src); + RuneString s_chars = rstring_from_cstring(chars); + RuneString s_res = trim_func(s_src, s_chars); + const char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_chars); + rstring_free(s_res); +} + +// Pads the string to the specified length by prepending/appending certain characters +// (spaces by default). +// text_lpad(str, length [,fill]) +// text_rpad(str, length [,fill]) +// [pg-compatible] lpad(string, length [, fill]) +// [pg-compatible] rpad(string, length [, fill]) +// (!) postgres does not support unicode strings in lpad/rpad, while this function does. 
+static void text_pad(sqlite3_context* context, int argc, sqlite3_value** argv) { + if (argc != 2 && argc != 3) { + sqlite3_result_error(context, "expected 2 or 3 parameters", -1); + return; + } + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "length parameter should be integer", -1); + return; + } + int length = sqlite3_value_int(argv[1]); + // postgres-compatible: treat negative length as zero + length = length < 0 ? 0 : length; + + const char* fill = argc == 3 ? (char*)sqlite3_value_text(argv[2]) : " "; + if (fill == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString (*pad_func)(RuneString, size_t, RuneString) = (void*)sqlite3_user_data(context); + + RuneString s_src = rstring_from_cstring(src); + RuneString s_fill = rstring_from_cstring(fill); + RuneString s_res = pad_func(s_src, length, s_fill); + const char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_fill); + rstring_free(s_res); +} + +#pragma endregion + +#pragma region Change case + +// Changes the case of the string. 
+// text_upper(str) +// text_lower(str) +// text_title(str) +// text_casefold(str) +static void text_change_case(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + const char* src = (const char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + size_t n = sqlite3_value_bytes(argv[0]); + + char* res = malloc(n + 1); + if (res == NULL) { + sqlite3_result_error_nomem(context); + return; + } + memcpy(res, src, n); + res[n] = '\0'; + + bool (*fn)(char*, size_t) = sqlite3_user_data(context); + fn(res, n); + + sqlite3_result_text(context, res, n, free); +} + +#pragma endregion + +#pragma region Other modifications + +// Replaces all old substrings with new substrings in the original string. +// text_replace(str, old, new) +// [pg-compatible] replace(string, from, to) +static void text_replace_all(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 3); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* old = (char*)sqlite3_value_text(argv[1]); + if (old == NULL) { + sqlite3_result_null(context); + return; + } + + const char* new = (char*)sqlite3_value_text(argv[2]); + if (new == NULL) { + sqlite3_result_null(context); + return; + } + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_old = bstring_from_cstring(old, sqlite3_value_bytes(argv[1])); + ByteString s_new = bstring_from_cstring(new, sqlite3_value_bytes(argv[2])); + ByteString s_res = bstring_replace_all(s_src, s_old, s_new); + const char* res = bstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, SQLITE_TRANSIENT); + bstring_free(s_src); + bstring_free(s_old); + bstring_free(s_new); + bstring_free(s_res); +} + +// Replaces old substrings with new substrings in the original string, +// but not more than `count` times. 
+// text_replace(str, old, new, count) +static void text_replace(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 4); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* old = (char*)sqlite3_value_text(argv[1]); + if (old == NULL) { + sqlite3_result_null(context); + return; + } + + const char* new = (char*)sqlite3_value_text(argv[2]); + if (new == NULL) { + sqlite3_result_null(context); + return; + } + + if (sqlite3_value_type(argv[3]) != SQLITE_INTEGER) { + sqlite3_result_error(context, "count parameter should be integer", -1); + return; + } + int count = sqlite3_value_int(argv[3]); + // treat negative count as zero + count = count < 0 ? 0 : count; + + ByteString s_src = bstring_from_cstring(src, sqlite3_value_bytes(argv[0])); + ByteString s_old = bstring_from_cstring(old, sqlite3_value_bytes(argv[1])); + ByteString s_new = bstring_from_cstring(new, sqlite3_value_bytes(argv[2])); + ByteString s_res = bstring_replace(s_src, s_old, s_new, count); + const char* res = bstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, SQLITE_TRANSIENT); + bstring_free(s_src); + bstring_free(s_old); + bstring_free(s_new); + bstring_free(s_res); +} + +// Replaces each string character that matches a character in the `from` set +// with the corresponding character in the `to` set. If `from` is longer than `to`, +// occurrences of the extra characters in `from` are deleted. +// text_translate(str, from, to) +// [pg-compatible] translate(string, from, to) +// (!) postgres does not support unicode strings in translate, while this function does. 
+static void text_translate(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 3); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + const char* from = (char*)sqlite3_value_text(argv[1]); + if (from == NULL) { + sqlite3_result_null(context); + return; + } + + const char* to = (char*)sqlite3_value_text(argv[2]); + if (to == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_src = rstring_from_cstring(src); + RuneString s_from = rstring_from_cstring(from); + RuneString s_to = rstring_from_cstring(to); + RuneString s_res = rstring_translate(s_src, s_from, s_to); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_from); + rstring_free(s_to); + rstring_free(s_res); +} + +// Reverses the order of the characters in the string. +// text_reverse(str) +// [pg-compatible] reverse(text) +// (!) postgres does not support unicode strings in reverse, while this function does. +static void text_reverse(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_src = rstring_from_cstring(src); + RuneString s_res = rstring_reverse(s_src); + char* res = rstring_to_cstring(s_res); + sqlite3_result_text(context, res, -1, free); + rstring_free(s_src); + rstring_free(s_res); +} + +#pragma endregion + +#pragma region Properties + +// Returns the number of characters in the string. 
+// text_length(str) +// [pg-compatible] length(text) +// [pg-compatible] char_length(text) +// [pg-compatible] character_length(text) +static void text_length(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + RuneString s_src = rstring_from_cstring(src); + sqlite3_result_int64(context, s_src.length); + rstring_free(s_src); +} + +// Returns the number of bytes in the string. +// text_size(str) +// [pg-compatible] octet_length(text) +static void text_size(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + sqlite3_result_int64(context, sqlite3_value_bytes(argv[0])); +} + +// Returns the number of bits in the string. +// text_bitsize(str) +// [pg-compatible] bit_length(text) +static void text_bit_size(sqlite3_context* context, int argc, sqlite3_value** argv) { + assert(argc == 1); + + const char* src = (char*)sqlite3_value_text(argv[0]); + if (src == NULL) { + sqlite3_result_null(context); + return; + } + + int size = sqlite3_value_bytes(argv[0]); + sqlite3_result_int64(context, 8 * size); +} + +#pragma endregion + +#pragma region Collation + +static int collate_nocase(void* unused, int n1, const void* s1, int n2, const void* s2) { + (void)unused; + return utf8_icmp((const char*)s1, (size_t)n1, (const char*)s2, (size_t)n2); +} + +#pragma endregion + +int text_init(sqlite3* db) { + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + + // substrings + sqlite3_create_function(db, "text_substring", 2, flags, 0, text_substring2, 0, 0); + sqlite3_create_function(db, "text_substring", 3, flags, 0, text_substring3, 0, 0); + sqlite3_create_function(db, "text_slice", 2, flags, 0, text_slice2, 0, 0); + sqlite3_create_function(db, 
"text_slice", 3, flags, 0, text_slice3, 0, 0); + sqlite3_create_function(db, "text_left", 2, flags, 0, text_left, 0, 0); + sqlite3_create_function(db, "left", 2, flags, 0, text_left, 0, 0); + sqlite3_create_function(db, "text_right", 2, flags, 0, text_right, 0, 0); + sqlite3_create_function(db, "right", 2, flags, 0, text_right, 0, 0); + + // search and match + sqlite3_create_function(db, "text_index", 2, flags, 0, text_index, 0, 0); + sqlite3_create_function(db, "strpos", 2, flags, 0, text_index, 0, 0); + sqlite3_create_function(db, "text_last_index", 2, flags, 0, text_last_index, 0, 0); + sqlite3_create_function(db, "text_contains", 2, flags, 0, text_contains, 0, 0); + sqlite3_create_function(db, "text_has_prefix", 2, flags, 0, text_has_prefix, 0, 0); + sqlite3_create_function(db, "starts_with", 2, flags, 0, text_has_prefix, 0, 0); + sqlite3_create_function(db, "text_has_suffix", 2, flags, 0, text_has_suffix, 0, 0); + sqlite3_create_function(db, "text_count", 2, flags, 0, text_count, 0, 0); + sqlite3_create_function(db, "text_like", 2, flags, 0, text_like, 0, 0); + + // split and join + sqlite3_create_function(db, "text_split", 3, flags, 0, text_split, 0, 0); + sqlite3_create_function(db, "split_part", 3, flags, 0, text_split, 0, 0); + sqlite3_create_function(db, "text_join", -1, flags, 0, text_join, 0, 0); + sqlite3_create_function(db, "concat_ws", -1, flags, 0, text_join, 0, 0); + sqlite3_create_function(db, "text_concat", -1, flags, 0, text_concat, 0, 0); + sqlite3_create_function(db, "concat", -1, flags, 0, text_concat, 0, 0); + sqlite3_create_function(db, "text_repeat", 2, flags, 0, text_repeat, 0, 0); + sqlite3_create_function(db, "repeat", 2, flags, 0, text_repeat, 0, 0); + + // trim and pad + sqlite3_create_function(db, "text_ltrim", -1, flags, rstring_trim_left, text_trim, 0, 0); + sqlite3_create_function(db, "ltrim", -1, flags, rstring_trim_left, text_trim, 0, 0); + sqlite3_create_function(db, "text_rtrim", -1, flags, rstring_trim_right, text_trim, 0, 
0); + sqlite3_create_function(db, "rtrim", -1, flags, rstring_trim_right, text_trim, 0, 0); + sqlite3_create_function(db, "text_trim", -1, flags, rstring_trim, text_trim, 0, 0); + sqlite3_create_function(db, "btrim", -1, flags, rstring_trim, text_trim, 0, 0); + sqlite3_create_function(db, "text_lpad", -1, flags, rstring_pad_left, text_pad, 0, 0); + sqlite3_create_function(db, "lpad", -1, flags, rstring_pad_left, text_pad, 0, 0); + sqlite3_create_function(db, "text_rpad", -1, flags, rstring_pad_right, text_pad, 0, 0); + sqlite3_create_function(db, "rpad", -1, flags, rstring_pad_right, text_pad, 0, 0); + + // change case + sqlite3_create_function(db, "text_upper", 1, flags, utf8_toupper, text_change_case, 0, 0); + sqlite3_create_function(db, "text_lower", 1, flags, utf8_tolower, text_change_case, 0, 0); + sqlite3_create_function(db, "text_title", 1, flags, utf8_totitle, text_change_case, 0, 0); + sqlite3_create_function(db, "text_casefold", 1, flags, utf8_casefold, text_change_case, 0, 0); + + // other modifications + sqlite3_create_function(db, "text_replace", 3, flags, 0, text_replace_all, 0, 0); + sqlite3_create_function(db, "text_replace", 4, flags, 0, text_replace, 0, 0); + sqlite3_create_function(db, "text_translate", 3, flags, 0, text_translate, 0, 0); + sqlite3_create_function(db, "translate", 3, flags, 0, text_translate, 0, 0); + sqlite3_create_function(db, "text_reverse", 1, flags, 0, text_reverse, 0, 0); + sqlite3_create_function(db, "reverse", 1, flags, 0, text_reverse, 0, 0); + + // properties + sqlite3_create_function(db, "text_length", 1, flags, 0, text_length, 0, 0); + sqlite3_create_function(db, "char_length", 1, flags, 0, text_length, 0, 0); + sqlite3_create_function(db, "character_length", 1, flags, 0, text_length, 0, 0); + sqlite3_create_function(db, "text_size", 1, flags, 0, text_size, 0, 0); + sqlite3_create_function(db, "octet_length", 1, flags, 0, text_size, 0, 0); + sqlite3_create_function(db, "text_bitsize", 1, flags, 0, text_bit_size, 0, 
0); + sqlite3_create_function(db, "bit_length", 1, flags, 0, text_bit_size, 0, 0); + + // collation + sqlite3_create_collation(db, "text_nocase", SQLITE_UTF8, NULL, collate_nocase); + + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/text/extension.h b/libsql-ffi/bundled/sqlean/text/extension.h new file mode 100644 index 0000000000..e511b14805 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// SQLite extension for working with text. + +#ifndef TEXT_EXTENSION_H +#define TEXT_EXTENSION_H + +#include "sqlite3ext.h" + +int text_init(sqlite3* db); + +#endif /* TEXT_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/text/rstring.c b/libsql-ffi/bundled/sqlean/text/rstring.c new file mode 100644 index 0000000000..b3384bec7a --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/rstring.c @@ -0,0 +1,452 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Rune (UTF-8) string data structure. + +#include +#include +#include +#include +#include +#include + +#include "text/rstring.h" +#include "text/runes.h" +#include "text/utf8/rune.h" + +// utf8_length returns the number of utf-8 characters in a string. +static size_t utf8_length(const char* str) { + size_t length = 0; + + while (*str != '\0') { + if (0xf0 == (0xf8 & *str)) { + // 4-byte utf8 code point (began with 0b11110xxx) + str += 4; + } else if (0xe0 == (0xf0 & *str)) { + // 3-byte utf8 code point (began with 0b1110xxxx) + str += 3; + } else if (0xc0 == (0xe0 & *str)) { + // 2-byte utf8 code point (began with 0b110xxxxx) + str += 2; + } else { // if (0x00 == (0x80 & *s)) { + // 1-byte ascii (began with 0b0xxxxxxx) + str += 1; + } + + // no matter the bytes we marched s forward by, it was + // only 1 utf8 codepoint + length++; + } + + return length; +} + +// rstring_new creates an empty string. 
+RuneString rstring_new(void) { + RuneString str = {.runes = NULL, .length = 0, .size = 0, .owning = true}; + return str; +} + +// rstring_from_runes creates a new string from an array of utf-8 characters. +// `owning` indicates whether the string owns the array and should free the memory when destroyed. +static RuneString rstring_from_runes(const int32_t* const runes, size_t length, bool owning) { + RuneString str = { + .runes = runes, .length = length, .size = length * sizeof(int32_t), .owning = owning}; + return str; +} + +// rstring_from_cstring creates a new string from a zero-terminated C string. +RuneString rstring_from_cstring(const char* const utf8str) { + size_t length = utf8_length(utf8str); + int32_t* runes = length > 0 ? runes_from_cstring(utf8str, length) : NULL; + return rstring_from_runes(runes, length, true); +} + +// rstring_to_cstring converts the string to a zero-terminated C string. +char* rstring_to_cstring(RuneString str) { + return runes_to_cstring(str.runes, str.length); +} + +// rstring_free destroys the string, freeing resources if necessary. +void rstring_free(RuneString str) { + if (str.owning && str.runes != NULL) { + free((void*)str.runes); + } +} + +// rstring_at returns a character by its index in the string. +int32_t rstring_at(RuneString str, size_t idx) { + if (str.length == 0) { + return 0; + } + if (idx < 0 || idx >= str.length) { + return 0; + }; + return str.runes[idx]; +} + +// rstring_slice returns a slice of the string, +// from the `start` index (inclusive) to the `end` index (non-inclusive). +// Negative `start` and `end` values count from the end of the string. +RuneString rstring_slice(RuneString str, int start, int end) { + if (str.length == 0) { + return rstring_new(); + } + + // adjusted start index + start = start < 0 ? (int)str.length + start : start; + // python-compatible: treat negative start index larger than the length of the string as zero + start = start < 0 ? 
0 : start; + // adjusted start index should be less the the length of the string + if (start >= (int)str.length) { + return rstring_new(); + } + + // adjusted end index + end = end < 0 ? (int)str.length + end : end; + // python-compatible: treat end index larger than the length of the string + // as equal to the length + end = end > (int)str.length ? (int)str.length : end; + // adjusted end index should be >= 0 + if (end < 0) { + return rstring_new(); + } + + // adjusted start index should be less than adjusted end index + if (start >= end) { + return rstring_new(); + } + + int32_t* at = (int32_t*)str.runes + start; + size_t length = end - start; + RuneString slice = rstring_from_runes(at, length, false); + return slice; +} + +// rstring_substring returns a substring of `length` characters, +// starting from the `start` index. +RuneString rstring_substring(RuneString str, size_t start, size_t length) { + if (length > str.length - start) { + length = str.length - start; + } + return rstring_slice(str, start, start + length); +} + +// rstring_contains_after checks if the other string is a substring of the original string, +// starting at the `start` index. +static bool rstring_contains_after(RuneString str, RuneString other, size_t start) { + if (start + other.length > str.length) { + return false; + } + for (size_t idx = 0; idx < other.length; idx++) { + if (str.runes[start + idx] != other.runes[idx]) { + return false; + } + } + return true; +} + +// rstring_index_char returns the first index of the character in the string +// after the `start` index, inclusive. +static int rstring_index_char(RuneString str, int32_t rune, size_t start) { + for (size_t idx = start; idx < str.length; idx++) { + if (str.runes[idx] == rune) { + return idx; + } + } + return -1; +} + +// rstring_index_char returns the last index of the character in the string +// before the `end` index, inclusive. 
+static int rstring_last_index_char(RuneString str, int32_t rune, size_t end) { + if (end >= str.length) { + return -1; + } + for (int idx = end; idx >= 0; idx--) { + if (str.runes[idx] == rune) { + return idx; + } + } + return -1; +} + +// rstring_index_after returns the index of the substring in the original string +// after the `start` index, inclusive. +static int rstring_index_after(RuneString str, RuneString other, size_t start) { + if (other.length == 0) { + return start; + } + if (str.length == 0 || other.length > str.length) { + return -1; + } + + size_t cur_idx = start; + while (cur_idx < str.length) { + int match_idx = rstring_index_char(str, other.runes[0], cur_idx); + if (match_idx == -1) { + return match_idx; + } + if (rstring_contains_after(str, other, match_idx)) { + return match_idx; + } + cur_idx = match_idx + 1; + } + return -1; +} + +// rstring_index returns the first index of the substring in the original string. +int rstring_index(RuneString str, RuneString other) { + return rstring_index_after(str, other, 0); +} + +// rstring_last_index returns the last index of the substring in the original string. +int rstring_last_index(RuneString str, RuneString other) { + if (other.length == 0) { + return str.length - 1; + } + if (str.length == 0 || other.length > str.length) { + return -1; + } + + int cur_idx = str.length - 1; + while (cur_idx >= 0) { + int match_idx = rstring_last_index_char(str, other.runes[0], cur_idx); + if (match_idx == -1) { + return match_idx; + } + if (rstring_contains_after(str, other, match_idx)) { + return match_idx; + } + cur_idx = match_idx - 1; + } + + return -1; +} + +// rstring_like returns true if the string matches a LIKE pattern. +bool rstring_like(RuneString pattern, RuneString str) { + size_t pidx = 0, sidx = 0, star_idx = SIZE_MAX, match = 0; + + while (sidx < str.length) { + int32_t prune = (pidx < pattern.length) ? 
pattern.runes[pidx] : 0; + int32_t srune = str.runes[sidx]; + + if (prune == '%') { + star_idx = ++pidx; + match = ++sidx; + if (pidx == pattern.length) { + return true; + } + } else if (prune == '_' || rune_casefold(prune) == rune_casefold(srune)) { + pidx++; + sidx++; + } else if (star_idx != SIZE_MAX) { + pidx = star_idx; + sidx = match++; + } else { + return false; + } + } + + while (pidx < pattern.length && pattern.runes[pidx] == '%') { + pidx++; + } + return pidx == pattern.length; +} + +// rstring_translate replaces each string character that matches a character in the `from` set with +// the corresponding character in the `to` set. If `from` is longer than `to`, occurrences of the +// extra characters in `from` are deleted. +RuneString rstring_translate(RuneString str, RuneString from, RuneString to) { + if (str.length == 0) { + return rstring_new(); + } + + // empty mapping, return the original string + if (from.length == 0) { + return rstring_from_runes(str.runes, str.length, false); + } + + // resulting string can be no longer than the original one + int32_t* runes = calloc(str.length, sizeof(int32_t)); + if (runes == NULL) { + return rstring_new(); + } + + // but it may be shorter, so we should track its length separately + size_t length = 0; + // perform the translation + for (size_t idx = 0; idx < str.length; idx++) { + size_t k = 0; + // map idx-th character in str `from` -> `to` + for (; k < from.length && k < to.length; k++) { + if (str.runes[idx] == from.runes[k]) { + runes[length] = to.runes[k]; + length++; + break; + } + } + // if `from` is longer than `to`, ingore idx-th character found in `from` + bool ignore = false; + for (; k < from.length; k++) { + if (str.runes[idx] == from.runes[k]) { + ignore = true; + break; + } + } + // else copy idx-th character as is + if (!ignore) { + runes[length] = str.runes[idx]; + length++; + } + } + + return rstring_from_runes(runes, length, true); +} + +// rstring_reverse returns the reversed string. 
+RuneString rstring_reverse(RuneString str) { + int32_t* runes = (int32_t*)str.runes; + for (size_t i = 0; i < str.length / 2; i++) { + int32_t r = runes[i]; + runes[i] = runes[str.length - 1 - i]; + runes[str.length - 1 - i] = r; + } + RuneString res = rstring_from_runes(runes, str.length, false); + return res; +} + +// rstring_trim_left trims certain characters from the beginning of the string. +RuneString rstring_trim_left(RuneString str, RuneString chars) { + if (str.length == 0) { + return rstring_new(); + } + size_t idx = 0; + for (; idx < str.length; idx++) { + if (rstring_index_char(chars, str.runes[idx], 0) == -1) { + break; + } + } + return rstring_slice(str, idx, str.length); +} + +// rstring_trim_right trims certain characters from the end of the string. +RuneString rstring_trim_right(RuneString str, RuneString chars) { + if (str.length == 0) { + return rstring_new(); + } + int idx = str.length - 1; + for (; idx >= 0; idx--) { + if (rstring_index_char(chars, str.runes[idx], 0) == -1) { + break; + } + } + return rstring_slice(str, 0, idx + 1); +} + +// rstring_trim trims certain characters from the beginning and end of the string. +RuneString rstring_trim(RuneString str, RuneString chars) { + if (str.length == 0) { + return rstring_new(); + } + size_t left = 0; + for (; left < str.length; left++) { + if (rstring_index_char(chars, str.runes[left], 0) == -1) { + break; + } + } + int right = str.length - 1; + for (; right >= 0; right--) { + if (rstring_index_char(chars, str.runes[right], 0) == -1) { + break; + } + } + return rstring_slice(str, left, right + 1); +} + +// rstring_pad_left pads the string to the specified length by prepending `fill` characters. +// If the string is already longer than the specified length, it is truncated on the right. 
+RuneString rstring_pad_left(RuneString str, size_t length, RuneString fill) { + if (str.length >= length) { + // If the string is already longer than length, return a truncated version of the string + return rstring_substring(str, 0, length); + } + + if (fill.length == 0) { + // If the fill string is empty, return the original string + return rstring_from_runes(str.runes, str.length, false); + } + + // Calculate the number of characters to pad + size_t pad_langth = length - str.length; + + // Allocate memory for the padded string + size_t new_size = (str.length + pad_langth) * sizeof(int32_t); + int32_t* new_runes = malloc(new_size); + if (new_runes == NULL) { + return rstring_new(); + } + + // Copy the fill characters to the beginning of the new string + for (size_t i = 0; i < pad_langth; i++) { + new_runes[i] = fill.runes[i % fill.length]; + } + + // Copy the original string to the end of the new string + memcpy(&new_runes[pad_langth], str.runes, str.size); + + // Return the new string + RuneString new_str = rstring_from_runes(new_runes, length, true); + return new_str; +} + +// rstring_pad_right pads the string to the specified length by appending `fill` characters. +// If the string is already longer than the specified length, it is truncated on the right. 
+RuneString rstring_pad_right(RuneString str, size_t length, RuneString fill) { + if (str.length >= length) { + // If the string is already longer than length, return a truncated version of the string + return rstring_substring(str, 0, length); + } + + if (fill.length == 0) { + // If the fill string is empty, return the original string + return rstring_from_runes(str.runes, str.length, false); + } + + // Calculate the number of characters to pad + size_t pad_length = length - str.length; + + // Allocate memory for the padded string + size_t new_size = (str.length + pad_length) * sizeof(int32_t); + int32_t* new_runes = malloc(new_size); + if (new_runes == NULL) { + return rstring_new(); + } + + // Copy the original string to the beginning of the new string + memcpy(new_runes, str.runes, str.size); + + // Copy the fill characters to the end of the new string + for (size_t i = str.length; i < length; i++) { + new_runes[i] = fill.runes[(i - str.length) % fill.length]; + } + + // Return the new string + RuneString new_str = rstring_from_runes(new_runes, length, true); + return new_str; +} + +// rstring_print prints the string to stdout. +void rstring_print(RuneString str) { + if (str.length == 0) { + printf("'' (len=0)\n"); + return; + } + printf("'"); + for (size_t i = 0; i < str.length; i++) { + printf("%08x ", str.runes[i]); + } + printf("' (len=%zu)", str.length); + printf("\n"); +} diff --git a/libsql-ffi/bundled/sqlean/text/rstring.h b/libsql-ffi/bundled/sqlean/text/rstring.h new file mode 100644 index 0000000000..96ea501e27 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/rstring.h @@ -0,0 +1,51 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Rune (UTF-8) string data structure. + +#ifndef RSTRING_H +#define RSTRING_H + +#include +#include +#include + +// RuneString is a string composed of UTF-8 characters (runes). 
+typedef struct { + // array of utf-8 characters + const int32_t* runes; + // number of characters in the string + size_t length; + // number of bytes in the string + size_t size; + // indicates whether the string owns the array + // and should free the memory when destroyed + bool owning; +} RuneString; + +// RuneString methods. +RuneString rstring_new(void); +RuneString rstring_from_cstring(const char* const utf8str); +char* rstring_to_cstring(RuneString str); +void rstring_free(RuneString str); + +int32_t rstring_at(RuneString str, size_t idx); +RuneString rstring_slice(RuneString str, int start, int end); +RuneString rstring_substring(RuneString str, size_t start, size_t length); + +int rstring_index(RuneString str, RuneString other); +int rstring_last_index(RuneString str, RuneString other); +bool rstring_like(RuneString pattern, RuneString str); + +RuneString rstring_translate(RuneString str, RuneString from, RuneString to); +RuneString rstring_reverse(RuneString str); + +RuneString rstring_trim_left(RuneString str, RuneString chars); +RuneString rstring_trim_right(RuneString str, RuneString chars); +RuneString rstring_trim(RuneString str, RuneString chars); +RuneString rstring_pad_left(RuneString str, size_t length, RuneString fill); +RuneString rstring_pad_right(RuneString str, size_t length, RuneString fill); + +void rstring_print(RuneString str); + +#endif /* RSTRING_H */ diff --git a/libsql-ffi/bundled/sqlean/text/runes.c b/libsql-ffi/bundled/sqlean/text/runes.c new file mode 100644 index 0000000000..7628d5a871 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/runes.c @@ -0,0 +1,63 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// UTF-8 characters (runes) <-> C string conversions. + +#include +#include +#include +#include + +#include "text/runes.h" +#include "text/utf8/utf8.h" + +// runes_from_cstring creates an array of runes from a C string. 
+int32_t* runes_from_cstring(const char* const str, size_t length) { + assert(length > 0); + int32_t* runes = calloc(length, sizeof(int32_t)); + if (runes == NULL) { + return NULL; + } + + utf8_decode_t d = {.state = 0}; + const char* s = str; + size_t idx = 0; + while (idx < length && *s != 0) { + do { + utf8_decode(&d, (uint8_t)*s++); + } while (d.state); + runes[idx] = d.codep; + idx += 1; + } + + return runes; +} + +// runes_to_cstring creates a C string from an array of runes. +char* runes_to_cstring(const int32_t* runes, size_t length) { + char* str; + if (length == 0) { + str = calloc(1, sizeof(char)); + return str; + } + + size_t maxlen = length * sizeof(int32_t) + 1; + str = malloc(maxlen); + if (str == NULL) { + return NULL; + } + + char* at = str; + for (size_t i = 0; i < length; i++) { + at += utf8_encode(at, runes[i]); + } + *at = '\0'; + at += 1; + + if ((size_t)(at - str) < maxlen) { + // shrink to real size + size_t size = at - str; + str = realloc(str, size); + } + return str; +} diff --git a/libsql-ffi/bundled/sqlean/text/runes.h b/libsql-ffi/bundled/sqlean/text/runes.h new file mode 100644 index 0000000000..643a181032 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/runes.h @@ -0,0 +1,15 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// UTF-8 characters (runes) <-> C string conversions. + +#ifndef RUNES_H +#define RUNES_H + +#include +#include + +int32_t* runes_from_cstring(const char* const str, size_t length); +char* runes_to_cstring(const int32_t* runes, size_t length); + +#endif /* RUNES_H */ diff --git a/libsql-ffi/bundled/sqlean/text/utf8/case.c b/libsql-ffi/bundled/sqlean/text/utf8/case.c new file mode 100644 index 0000000000..2e4fcff97f --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/utf8/case.c @@ -0,0 +1,67 @@ +// Copyright (c) 2024 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Case conversion functions for utf8 strings. 
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "text/utf8/rune.h"
+#include "text/utf8/utf8.h"
+
+// utf8_next_rune decodes the rune that starts at s and stores it in *rune.
+// It reads at most n bytes. Returns the number of bytes the rune occupies,
+// or 0 if the bytes are not valid UTF-8 or the sequence is cut off by the
+// end of the buffer (an embedded NUL inside a sequence is also rejected).
+static size_t utf8_next_rune(const char* s, size_t n, uint32_t* rune) {
+    // State the decoder DFA parks in after malformed input. Every transition
+    // out of it leads back to it, so looping on `d.state != 0` alone would
+    // never stop and would walk off the end of the buffer.
+    const uint32_t reject = 12;
+    utf8_decode_t d = {.state = 0};
+    size_t width = 0;
+    do {
+        if (width == n || utf8_decode(&d, (uint8_t)s[width++]) == reject) {
+            return 0;
+        }
+    } while (d.state);
+    *rune = d.codep;
+    return width;
+}
+
+// utf8_put_rune replaces the width-byte rune at s with the rune c.
+// The string is edited in place, so c is only written when it encodes to
+// exactly width bytes; a mapping that would change the byte length leaves
+// the original rune untouched instead of clobbering its neighbours.
+// Returns false if c cannot be encoded at all.
+static bool utf8_put_rune(char* s, size_t width, uint32_t c) {
+    char buf[4];  // utf8_encode writes at most 4 bytes
+    int len = utf8_encode(buf, c);
+    if (len == 0) {
+        return false;
+    }
+    if ((size_t)len == width) {
+        for (int i = 0; i < len; ++i) {
+            s[i] = buf[i];
+        }
+    }
+    return true;
+}
+
+// utf8_transform converts the utf8 string s using the transform function.
+// s is modified in place; n is its length in bytes and processing also stops
+// at the first NUL byte. Returns false on malformed UTF-8 or if a transformed
+// rune cannot be encoded; runes before the failure point stay converted.
+static bool utf8_transform(char* s, size_t n, uint32_t (*transform)(uint32_t)) {
+    // `&&` (not `&`) so that *s is never read once n bytes are consumed.
+    while (n > 0 && *s != 0) {
+        uint32_t rune;
+        size_t width = utf8_next_rune(s, n, &rune);
+        if (width == 0) {
+            return false;
+        }
+        if (!utf8_put_rune(s, width, transform(rune))) {
+            return false;
+        }
+        // Advance by the bytes actually consumed, which never exceeds n.
+        s += width;
+        n -= width;
+    }
+    return true;
+}
+
+// utf8_tolower converts the utf8 string s to lowercase.
+// Returns true if successful, false if an error occurred.
+bool utf8_tolower(char* s, size_t n) {
+    return utf8_transform(s, n, rune_tolower);
+}
+
+// utf8_toupper converts the utf8 string s to uppercase.
+// Returns true if successful, false if an error occurred.
+bool utf8_toupper(char* s, size_t n) {
+    return utf8_transform(s, n, rune_toupper);
+}
+
+// utf8_casefold converts the utf8 string s to folded-case.
+// Returns true if successful, false if an error occurred.
+bool utf8_casefold(char* s, size_t n) {
+    return utf8_transform(s, n, rune_casefold);
+}
+
+// utf8_totitle converts the utf8 string s to title-case: the first rune and
+// every rune that follows a non-word rune is uppercased, all others are
+// lowercased. Returns true if successful, false if an error occurred.
+bool utf8_totitle(char* s, size_t n) {
+    bool upper = true;
+    while (n > 0 && *s != 0) {
+        uint32_t rune;
+        size_t width = utf8_next_rune(s, n, &rune);
+        if (width == 0) {
+            return false;
+        }
+        uint32_t c = upper ? rune_toupper(rune) : rune_tolower(rune);
+        if (!utf8_put_rune(s, width, c)) {
+            return false;
+        }
+        // Word boundaries are judged on the rune as it was read.
+        upper = !rune_isword(rune);
+        s += width;
+        n -= width;
+    }
+    return true;
+}
diff --git a/libsql-ffi/bundled/sqlean/text/utf8/groups.h b/libsql-ffi/bundled/sqlean/text/utf8/groups.h
new file mode 100644
index 0000000000..ac35b37a11
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/text/utf8/groups.h
@@ -0,0 +1,159 @@
+#ifndef UTF8_GROUPS_H
+#define UTF8_GROUPS_H
+
+/* The tables below are extracted from the RE2 library */
+#include <stdint.h>
+#include "rune.h"
+
+typedef struct {
+    uint16_t lo;
+    uint16_t hi;
+} URange16;
+
+typedef struct {
+    const URange16* r16;
+    int nr16;
+} UGroup;
+
+static const URange16 Cc_range16[] = {
+    // Control
+    {0, 31},
+    {127, 159},
+};
+
+static const URange16 Lt_range16[] = {
+    // Title case
+    {453, 453},   {456, 456},   {459, 459},   {498, 498},   {8072, 8079},
+    {8088, 8095}, {8104, 8111}, {8124, 8124}, {8140, 8140}, {8188, 8188},
+};
+
+static const URange16 Nd_range16[] = {
+    // Decimal number
+    {48, 57},       {1632, 1641},   {1776, 1785},   {1984, 1993},   {2406, 2415},   {2534, 2543},
+    {2662, 2671},   {2790, 2799},   {2918, 2927},   {3046, 3055},   {3174, 3183},   {3302, 3311},
+    {3430, 3439},   {3558, 3567},   {3664, 3673},   {3792, 3801},   {3872, 3881},   {4160, 4169},
+    {4240, 4249},   {6112, 6121},   {6160, 6169},   {6470, 6479},   {6608, 6617},   {6784, 6793},
+    {6800, 6809},   {6992, 7001},   {7088, 7097},   {7232, 7241},   {7248, 7257},   {42528, 42537},
+    {43216, 43225}, {43264, 43273}, {43472, 43481}, {43504, 43513}, {43600, 43609}, {44016, 44025},
+    {65296, 65305},
+};
+
+static const URange16 Nl_range16[] = {
+    // Number letter
+    {5870, 5872},   {8544, 8578},   {8581, 8584},   {12295, 12295},
+    {12321, 12329}, {12344, 12346}, {42726, 42735},
+};
+
+static const URange16 Pc_range16[] = {
+    // Connector punctuation
+    {95, 95}, {8255, 8256}, {8276, 8276}, {65075, 65076}, {65101, 65103}, {65343, 65343},
+};
+
+static const URange16 Pd_range16[] = {
+    // Dash punctuation
+    {45, 45},       {1418, 1418},   {1470, 1470},   {5120, 5120},   {6150, 6150},   {8208, 8213},
+    {11799, 11799}, {11802, 11802}, {11834, 11835}, {11840, 11840}, {11869, 11869}, {12316, 12316},
+    {12336, 12336}, {12448, 12448}, {65073, 65074}, {65112, 65112}, {65123, 65123}, {65293, 65293},
+};
+
+static const URange16 Pf_range16[] = {
+    // Final punctuation
+    {187, 187},     {8217, 8217},   {8221, 8221},   {8250, 8250},   {11779, 11779},
+    {11781, 11781}, {11786, 11786}, {11789, 11789}, {11805, 11805}, {11809, 11809},
+};
+
+static const URange16 Pi_range16[] = {
+    // Initial punctuation
+    {171, 171},     {8216, 8216},   {8219, 8220},   {8223, 8223},   {8249, 8249},   {11778, 11778},
+    {11780, 11780}, {11785, 11785}, {11788, 11788}, {11804, 11804}, {11808, 11808},
+};
+
+static const URange16 Sc_range16[] = {
+    // Currency symbol
+    {36, 36},       {162, 165},     {1423, 1423},   {1547, 1547},   {2046, 2047},   {2546, 2547},
+    {2555, 2555},   {2801, 2801},   {3065, 3065},   {3647, 3647},   {6107, 6107},   {8352, 8384},
+    {43064, 43064}, {65020, 65020}, {65129, 65129}, {65284, 65284}, {65504, 65505}, {65509, 65510},
+};
+
+static const URange16 Zl_range16[] = {
+    // Line separator
+    {8232, 8232},
+};
+
+static const URange16 Zp_range16[] = {
+    // Paragraph separator
+    {8233, 8233},
+};
+
+static const URange16 Zs_range16[] = {
+    // Space separator
+    {32, 32}, {160, 160}, {5760, 5760}, {8192, 8202}, {8239, 8239}, {8287, 8287}, {12288, 12288},
+};
+
+static const URange16 Arabic_range16[] = {
+    {1536, 1540},   {1542, 1547},   {1549, 1562},   {1564, 1566},   {1568, 1599},   {1601, 1610},
+    {1622, 1647},   {1649, 1756},   {1758, 1791},   {1872, 1919},   {2160, 2190},   {2192, 2193},
+    {2200, 2273},   {2275, 2303},   {64336, 64450}, {64467, 64829}, {64832, 64911}, {64914, 64967},
+    {64975, 64975}, {65008, 65023}, {65136, 65140}, {65142, 65276},
+};
+
+static const URange16 Cyrillic_range16[] = {
+    {1024, 1156}, {1159, 1327},   {7296, 7304},   {7467, 7467},
+    {7544, 7544}, {11744, 11775}, {42560, 42655}, {65070, 65071},
+};
+
+static const URange16 Devanagari_range16[] = {
+    {2304, 2384},
+    {2389, 2403},
+    {2406, 2431},
+    {43232, 43263},
+};
+
+static const URange16 Greek_range16[] = {
+    {880, 883},   {885, 887},   {890, 893},     {895, 895},   {900, 900},   {902, 902},
+    {904, 906},   {908, 908},   {910, 929},     {931, 993},   {1008, 1023}, {7462, 7466},
+    {7517, 7521}, {7526, 7530}, {7615, 7615},   {7936, 7957}, {7960, 7965}, {7968, 8005},
+    {8008, 8013}, {8016, 8023}, {8025, 8025},   {8027, 8027}, {8029, 8029}, {8031, 8061},
+    {8064, 8116}, {8118, 8132}, {8134, 8147},   {8150, 8155}, {8157, 8175}, {8178, 8180},
+    {8182, 8190}, {8486, 8486}, {43877, 43877},
+};
+
+static const URange16 Han_range16[] = {
+    {11904, 11929}, {11931, 12019}, {12032, 12245}, {12293, 12293}, {12295, 12295}, {12321, 12329},
+    {12344, 12347}, {13312, 19903}, {19968, 40959}, {63744, 64109}, {64112, 64217},
+};
+
+static const URange16 Latin_range16[] = {
+    {65, 90},       {97, 122},      {170, 170},     {186, 186},     {192, 214},     {216, 246},
+    {248, 696},     {736, 740},     {7424, 7461},   {7468, 7516},   {7522, 7525},   {7531, 7543},
+    {7545, 7614},   {7680, 7935},   {8305, 8305},   {8319, 8319},   {8336, 8348},   {8490, 8491},
+    {8498, 8498},   {8526, 8526},   {8544, 8584},   {11360, 11391}, {42786, 42887}, {42891, 42954},
+    {42960, 42961}, {42963, 42963}, {42965, 42969}, {42994, 43007}, {43824, 43866}, {43868, 43876},
+    {43878, 43881}, {64256, 64262}, {65313, 65338}, {65345, 65370},
+};
+
+#define UNI_ENTRY(Code) {Code##_range16, sizeof(Code##_range16) / sizeof(URange16)}
+#define _e_arg(k, v) [k] = v
+
+static const UGroup _utf8_unicode_groups[U8G_SIZE] = {
+    [U8G_Cc] = UNI_ENTRY(Cc),
+    [U8G_Lt] = UNI_ENTRY(Lt),
+    [U8G_Nd] = UNI_ENTRY(Nd),
+    [U8G_Nl] = UNI_ENTRY(Nl),
+    [U8G_Pc] = UNI_ENTRY(Pc),
+    [U8G_Pd] = UNI_ENTRY(Pd),
+    [U8G_Pf] = UNI_ENTRY(Pf),
+    [U8G_Pi] = UNI_ENTRY(Pi),
+    [U8G_Sc] = UNI_ENTRY(Sc),
+    [U8G_Zl] = UNI_ENTRY(Zl),
+    [U8G_Zp] = UNI_ENTRY(Zp),
+    [U8G_Zs] = UNI_ENTRY(Zs),
+    [U8G_Arabic] = UNI_ENTRY(Arabic),
+    [U8G_Cyrillic] = UNI_ENTRY(Cyrillic),
+    [U8G_Devanagari] = UNI_ENTRY(Devanagari),
+    [U8G_Greek] = UNI_ENTRY(Greek),
+    [U8G_Han] = UNI_ENTRY(Han),
+    [U8G_Latin] = UNI_ENTRY(Latin),
+};
+
+#endif  // UTF8_GROUPS_H
diff --git a/libsql-ffi/bundled/sqlean/text/utf8/rune.c b/libsql-ffi/bundled/sqlean/text/utf8/rune.c
new file mode 100644
index 0000000000..7e529e9571
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/text/utf8/rune.c
@@ -0,0 +1,178 @@
+/* MIT License
+ *
+ * Copyright (c) 2023 Tyge Løvset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Rune (utf8 codepoint) handling.
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "text/utf8/rune.h"
+
+#include "text/utf8/groups.h"
+#include "text/utf8/tables.h"
+
+#define c_arraylen(a) (size_t)(sizeof(a) / sizeof 0 [a])
+
+// rune_isgroup returns true if c is in the unicode group.
+static bool rune_isgroup(int group, uint32_t c) {
+    // The ranges of a group are listed in ascending order, so the scan can
+    // give up at the first range that starts above c.
+    const UGroup* g = &_utf8_unicode_groups[group];
+    for (int i = 0; i < g->nr16; ++i) {
+        const URange16* range = &g->r16[i];
+        if (c < range->lo) {
+            return false;
+        }
+        if (c <= range->hi) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// rune_isupper reports whether c is an uppercase letter,
+// i.e. whether lowercasing it yields a different rune.
+bool rune_isupper(uint32_t c) {
+    const uint32_t lowered = rune_tolower(c);
+    return lowered != c;
+}
+
+// rune_islower reports whether c is a lowercase letter,
+// i.e. whether uppercasing it yields a different rune.
+bool rune_islower(uint32_t c) {
+    const uint32_t raised = rune_toupper(c);
+    return raised != c;
+}
+
+// rune_isdigit reports whether c is a digit character:
+// isdigit() for ASCII, the Nd (decimal number) group otherwise.
+bool rune_isdigit(uint32_t c) {
+    return c < 128 ? isdigit((int)c) != 0 : rune_isgroup(U8G_Nd, c);
+}
+
+// rune_isalpha reports whether c is an alphabetic character:
+// isalpha() for ASCII, membership in one of the letter groups otherwise.
+bool rune_isalpha(uint32_t c) {
+    static int16_t letter_groups[] = {U8G_Latin, U8G_Nl,         U8G_Greek, U8G_Cyrillic,
+                                      U8G_Han,   U8G_Devanagari, U8G_Arabic};
+    if (c < 128) {
+        return isalpha((int)c) != 0;
+    }
+    size_t k = 0;
+    while (k < c_arraylen(letter_groups)) {
+        if (rune_isgroup(letter_groups[k], c)) {
+            return true;
+        }
+        ++k;
+    }
+    return false;
+}
+
+// rune_isalnum reports whether c is an alphanumeric character.
+bool rune_isalnum(uint32_t c) {
+    if (c >= 128) {
+        return rune_isalpha(c) || rune_isgroup(U8G_Nd, c);
+    }
+    return isalnum((int)c) != 0;
+}
+
+// rune_isblank reports whether c is a blank character:
+// space or tab for ASCII, the Zs (space separator) group otherwise.
+bool rune_isblank(uint32_t c) {
+    if (c >= 128) {
+        return rune_isgroup(U8G_Zs, c);
+    }
+    return c == ' ' || c == '\t';
+}
+
+// rune_isspace reports whether c is a whitespace character:
+// isspace() for ASCII; otherwise the line separator (8232), the paragraph
+// separator (8233) or any member of the Zs group.
+bool rune_isspace(uint32_t c) {
+    if (c >= 128) {
+        const bool is_separator = c == 8232 || c == 8233;
+        return is_separator || rune_isgroup(U8G_Zs, c);
+    }
+    return isspace((int)c) != 0;
+}
+
+// rune_iscased reports whether c is a cased character:
+// a letter for ASCII; otherwise lowercase, uppercase or title case (Lt).
+bool rune_iscased(uint32_t c) {
+    if (c >= 128) {
+        return rune_islower(c) || rune_isupper(c) || rune_isgroup(U8G_Lt, c);
+    }
+    return isalpha((int)c) != 0;
+}
+
+// rune_isword returns true if c is a word character.
+bool rune_isword(uint32_t c) {
+    // ASCII: letters, digits and the underscore.
+    if (c >= 128) {
+        return rune_isalpha(c) || rune_isgroup(U8G_Nd, c) || rune_isgroup(U8G_Pc, c);
+    }
+    const bool alnum = isalnum((int)c) != 0;
+    return alnum || c == '_';
+}
+
+// Character transformation functions.
+
+// rune_map_forward maps c, which lies inside the c1..c2 range of m, to its
+// counterpart in the range that ends at m2. A distance of exactly 1 marks
+// an interleaved range, where only runes with the same parity as c2 move
+// (by one); otherwise the whole range is shifted by the distance.
+static uint32_t rune_map_forward(const struct CaseMapping* m, uint32_t c) {
+    const int delta = m->m2 - m->c2;
+    if (delta == 1) {
+        return c + ((m->c2 & 1) == (c & 1));
+    }
+    return (uint32_t)((int)c + delta);
+}
+
+// rune_casefold returns the unicode casefold of c.
+// Only the first casefold_len entries of casemappings take part.
+uint32_t rune_casefold(uint32_t c) {
+    for (int i = 0; i < casefold_len; ++i) {
+        const struct CaseMapping* m = &casemappings[i];
+        if (c > m->c2) {
+            continue;
+        }
+        // First range that does not end below c: either c is in it or
+        // c falls into a gap and has no mapping.
+        return c < m->c1 ? c : rune_map_forward(m, c);
+    }
+    return c;
+}
+
+// rune_tolower returns the lowercase version of c.
+// Entries are visited in the order given by upcase_ind.
+uint32_t rune_tolower(uint32_t c) {
+    const int count = (int)(sizeof upcase_ind / sizeof *upcase_ind);
+    for (int i = 0; i < count; ++i) {
+        const struct CaseMapping* m = &casemappings[upcase_ind[i]];
+        if (c > m->c2) {
+            continue;
+        }
+        return c < m->c1 ? c : rune_map_forward(m, c);
+    }
+    return c;
+}
+
+// rune_toupper returns the uppercase version of c.
+// Entries are visited in the order given by lowcase_ind and matched against
+// the mapped range, which ends at m2 and starts at c1 + (m2 - c2).
+uint32_t rune_toupper(uint32_t c) {
+    const int count = (int)(sizeof lowcase_ind / sizeof *lowcase_ind);
+    for (int i = 0; i < count; ++i) {
+        const struct CaseMapping* m = &casemappings[lowcase_ind[i]];
+        if (c > m->m2) {
+            continue;
+        }
+        const int delta = m->m2 - m->c2;
+        if (c < (uint32_t)(m->c1 + delta)) {
+            return c;
+        }
+        if (delta == 1) {
+            // Interleaved range: only runes with the parity of m2 move.
+            return c - ((m->m2 & 1) == (c & 1));
+        }
+        return (uint32_t)((int)c - delta);
+    }
+    return c;
+}
diff --git a/libsql-ffi/bundled/sqlean/text/utf8/rune.h b/libsql-ffi/bundled/sqlean/text/utf8/rune.h
new file mode 100644
index 0000000000..2941420d83
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/text/utf8/rune.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2024 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Rune (utf8 codepoint) handling.
+ +#ifndef UTF8_RUNE_H +#define UTF8_RUNE_H + +#include +#include + +enum { + U8G_Cc, + U8G_Lt, + U8G_Nd, + U8G_Nl, + U8G_Pc, + U8G_Pd, + U8G_Pf, + U8G_Pi, + U8G_Sc, + U8G_Zl, + U8G_Zp, + U8G_Zs, + U8G_Arabic, + U8G_Cyrillic, + U8G_Devanagari, + U8G_Greek, + U8G_Han, + U8G_Latin, + U8G_SIZE +}; + +bool rune_isupper(uint32_t c); +bool rune_islower(uint32_t c); +bool rune_isdigit(uint32_t c); +bool rune_isalpha(uint32_t c); +bool rune_isalnum(uint32_t c); +bool rune_isblank(uint32_t c); +bool rune_isspace(uint32_t c); +bool rune_iscased(uint32_t c); +bool rune_isword(uint32_t c); + +uint32_t rune_casefold(uint32_t c); +uint32_t rune_tolower(uint32_t c); +uint32_t rune_toupper(uint32_t c); + +#endif // UTF8_RUNE_H \ No newline at end of file diff --git a/libsql-ffi/bundled/sqlean/text/utf8/tables.h b/libsql-ffi/bundled/sqlean/text/utf8/tables.h new file mode 100644 index 0000000000..217c326ef7 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/utf8/tables.h @@ -0,0 +1,266 @@ +#ifndef UTF8_TABLES_H +#define UTF8_TABLES_H + +#include + +struct CaseMapping { + // c1 - rune at the start of the uppercase range + // c2 - rune at the end of the uppercase range + // m2 - rune at the end of the lowercase range + uint16_t c1, c2, m2; +}; + +static struct CaseMapping casemappings[] = { + {0x0041, 0x005A, 0x007A}, // A a (26) LATIN CAPITAL LETTER A + {0x00B5, 0x00B5, 0x03BC}, // µ μ ( 1) MICRO SIGN + {0x00C0, 0x00D6, 0x00F6}, // À à (23) LATIN CAPITAL LETTER A WITH GRAVE + {0x00D8, 0x00DE, 0x00FE}, // Ø ø ( 7) LATIN CAPITAL LETTER O WITH STROKE + {0x0100, 0x012E, 0x012F}, // Ā ā (24) LATIN CAPITAL LETTER A WITH MACRON + {0x0132, 0x0136, 0x0137}, // IJ ij ( 3) LATIN CAPITAL LIGATURE IJ + {0x0139, 0x0147, 0x0148}, // Ĺ ĺ ( 8) LATIN CAPITAL LETTER L WITH ACUTE + {0x014A, 0x0176, 0x0177}, // Ŋ ŋ (23) LATIN CAPITAL LETTER ENG + {0x0178, 0x0178, 0x00FF}, // Ÿ ÿ ( 1) LATIN CAPITAL LETTER Y WITH DIAERESIS + {0x0179, 0x017D, 0x017E}, // Ź ź ( 3) LATIN CAPITAL LETTER Z WITH ACUTE + 
{0x017F, 0x017F, 0x0073}, // ſ s ( 1) LATIN SMALL LETTER LONG S + {0x0181, 0x0181, 0x0253}, // Ɓ ɓ ( 1) LATIN CAPITAL LETTER B WITH HOOK + {0x0182, 0x0184, 0x0185}, // Ƃ ƃ ( 2) LATIN CAPITAL LETTER B WITH TOPBAR + {0x0186, 0x0186, 0x0254}, // Ɔ ɔ ( 1) LATIN CAPITAL LETTER OPEN O + {0x0187, 0x0187, 0x0188}, // Ƈ ƈ ( 1) LATIN CAPITAL LETTER C WITH HOOK + {0x0189, 0x018A, 0x0257}, // Ɖ ɖ ( 2) LATIN CAPITAL LETTER AFRICAN D + {0x018B, 0x018B, 0x018C}, // Ƌ ƌ ( 1) LATIN CAPITAL LETTER D WITH TOPBAR + {0x018E, 0x018E, 0x01DD}, // Ǝ ǝ ( 1) LATIN CAPITAL LETTER REVERSED E + {0x018F, 0x018F, 0x0259}, // Ə ə ( 1) LATIN CAPITAL LETTER SCHWA + {0x0190, 0x0190, 0x025B}, // Ɛ ɛ ( 1) LATIN CAPITAL LETTER OPEN E + {0x0191, 0x0191, 0x0192}, // Ƒ ƒ ( 1) LATIN CAPITAL LETTER F WITH HOOK + {0x0193, 0x0193, 0x0260}, // Ɠ ɠ ( 1) LATIN CAPITAL LETTER G WITH HOOK + {0x0194, 0x0194, 0x0263}, // Ɣ ɣ ( 1) LATIN CAPITAL LETTER GAMMA + {0x0196, 0x0196, 0x0269}, // Ɩ ɩ ( 1) LATIN CAPITAL LETTER IOTA + {0x0197, 0x0197, 0x0268}, // Ɨ ɨ ( 1) LATIN CAPITAL LETTER I WITH STROKE + {0x0198, 0x0198, 0x0199}, // Ƙ ƙ ( 1) LATIN CAPITAL LETTER K WITH HOOK + {0x019C, 0x019C, 0x026F}, // Ɯ ɯ ( 1) LATIN CAPITAL LETTER TURNED M + {0x019D, 0x019D, 0x0272}, // Ɲ ɲ ( 1) LATIN CAPITAL LETTER N WITH LEFT HOOK + {0x019F, 0x019F, 0x0275}, // Ɵ ɵ ( 1) LATIN CAPITAL LETTER O WITH MIDDLE TILDE + {0x01A0, 0x01A4, 0x01A5}, // Ơ ơ ( 3) LATIN CAPITAL LETTER O WITH HORN + {0x01A6, 0x01A6, 0x0280}, // Ʀ ʀ ( 1) LATIN LETTER YR + {0x01A7, 0x01A7, 0x01A8}, // Ƨ ƨ ( 1) LATIN CAPITAL LETTER TONE TWO + {0x01A9, 0x01A9, 0x0283}, // Ʃ ʃ ( 1) LATIN CAPITAL LETTER ESH + {0x01AC, 0x01AC, 0x01AD}, // Ƭ ƭ ( 1) LATIN CAPITAL LETTER T WITH HOOK + {0x01AE, 0x01AE, 0x0288}, // Ʈ ʈ ( 1) LATIN CAPITAL LETTER T WITH RETROFLEX HOOK + {0x01AF, 0x01AF, 0x01B0}, // Ư ư ( 1) LATIN CAPITAL LETTER U WITH HORN + {0x01B1, 0x01B2, 0x028B}, // Ʊ ʊ ( 2) LATIN CAPITAL LETTER UPSILON + {0x01B3, 0x01B5, 0x01B6}, // Ƴ ƴ ( 2) LATIN CAPITAL LETTER Y WITH HOOK + 
{0x01B7, 0x01B7, 0x0292}, // Ʒ ʒ ( 1) LATIN CAPITAL LETTER EZH + {0x01B8, 0x01B8, 0x01B9}, // Ƹ ƹ ( 1) LATIN CAPITAL LETTER EZH REVERSED + {0x01BC, 0x01BC, 0x01BD}, // Ƽ ƽ ( 1) LATIN CAPITAL LETTER TONE FIVE + {0x01C4, 0x01C4, 0x01C6}, // DŽ dž ( 1) LATIN CAPITAL LETTER DZ WITH CARON + {0x01C5, 0x01C5, 0x01C6}, // Dž dž ( 1) LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + {0x01C7, 0x01C7, 0x01C9}, // LJ lj ( 1) LATIN CAPITAL LETTER LJ + {0x01C8, 0x01C8, 0x01C9}, // Lj lj ( 1) LATIN CAPITAL LETTER L WITH SMALL LETTER J + {0x01CA, 0x01CA, 0x01CC}, // NJ nj ( 1) LATIN CAPITAL LETTER NJ + {0x01CB, 0x01DB, 0x01DC}, // Nj nj ( 9) LATIN CAPITAL LETTER N WITH SMALL LETTER J + {0x01DE, 0x01EE, 0x01EF}, // Ǟ ǟ ( 9) LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON + {0x01F1, 0x01F1, 0x01F3}, // DZ dz ( 1) LATIN CAPITAL LETTER DZ + {0x01F2, 0x01F4, 0x01F5}, // Dz dz ( 2) LATIN CAPITAL LETTER D WITH SMALL LETTER Z + {0x01F6, 0x01F6, 0x0195}, // Ƕ ƕ ( 1) LATIN CAPITAL LETTER HWAIR + {0x01F7, 0x01F7, 0x01BF}, // Ƿ ƿ ( 1) LATIN CAPITAL LETTER WYNN + {0x01F8, 0x021E, 0x021F}, // Ǹ ǹ (20) LATIN CAPITAL LETTER N WITH GRAVE + {0x0220, 0x0220, 0x019E}, // Ƞ ƞ ( 1) LATIN CAPITAL LETTER N WITH LONG RIGHT LEG + {0x0222, 0x0232, 0x0233}, // Ȣ ȣ ( 9) LATIN CAPITAL LETTER OU + {0x023A, 0x023A, 0x2C65}, // Ⱥ ⱥ ( 1) LATIN CAPITAL LETTER A WITH STROKE + {0x023B, 0x023B, 0x023C}, // Ȼ ȼ ( 1) LATIN CAPITAL LETTER C WITH STROKE + {0x023D, 0x023D, 0x019A}, // Ƚ ƚ ( 1) LATIN CAPITAL LETTER L WITH BAR + {0x023E, 0x023E, 0x2C66}, // Ⱦ ⱦ ( 1) LATIN CAPITAL LETTER T WITH DIAGONAL STROKE + {0x0241, 0x0241, 0x0242}, // Ɂ ɂ ( 1) LATIN CAPITAL LETTER GLOTTAL STOP + {0x0243, 0x0243, 0x0180}, // Ƀ ƀ ( 1) LATIN CAPITAL LETTER B WITH STROKE + {0x0244, 0x0244, 0x0289}, // Ʉ ʉ ( 1) LATIN CAPITAL LETTER U BAR + {0x0245, 0x0245, 0x028C}, // Ʌ ʌ ( 1) LATIN CAPITAL LETTER TURNED V + {0x0246, 0x024E, 0x024F}, // Ɇ ɇ ( 5) LATIN CAPITAL LETTER E WITH STROKE + {0x0345, 0x0345, 0x03B9}, // ͅ ι ( 1) COMBINING GREEK 
YPOGEGRAMMENI + {0x0370, 0x0372, 0x0373}, // Ͱ ͱ ( 2) GREEK CAPITAL LETTER HETA + {0x0376, 0x0376, 0x0377}, // Ͷ ͷ ( 1) GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA + {0x037F, 0x037F, 0x03F3}, // Ϳ ϳ ( 1) GREEK CAPITAL LETTER YOT + {0x0386, 0x0386, 0x03AC}, // Ά ά ( 1) GREEK CAPITAL LETTER ALPHA WITH TONOS + {0x0388, 0x038A, 0x03AF}, // Έ έ ( 3) GREEK CAPITAL LETTER EPSILON WITH TONOS + {0x038C, 0x038C, 0x03CC}, // Ό ό ( 1) GREEK CAPITAL LETTER OMICRON WITH TONOS + {0x038E, 0x038F, 0x03CE}, // Ύ ύ ( 2) GREEK CAPITAL LETTER UPSILON WITH TONOS + {0x0391, 0x03A1, 0x03C1}, // Α α (17) GREEK CAPITAL LETTER ALPHA + {0x03A3, 0x03AB, 0x03CB}, // Σ σ ( 9) GREEK CAPITAL LETTER SIGMA + {0x03C2, 0x03C2, 0x03C3}, // ς σ ( 1) GREEK SMALL LETTER FINAL SIGMA + {0x03CF, 0x03CF, 0x03D7}, // Ϗ ϗ ( 1) GREEK CAPITAL KAI SYMBOL + {0x03D0, 0x03D0, 0x03B2}, // ϐ β ( 1) GREEK BETA SYMBOL + {0x03D1, 0x03D1, 0x03B8}, // ϑ θ ( 1) GREEK THETA SYMBOL + {0x03D5, 0x03D5, 0x03C6}, // ϕ φ ( 1) GREEK PHI SYMBOL + {0x03D6, 0x03D6, 0x03C0}, // ϖ π ( 1) GREEK PI SYMBOL + {0x03D8, 0x03EE, 0x03EF}, // Ϙ ϙ (12) GREEK LETTER ARCHAIC KOPPA + {0x03F0, 0x03F0, 0x03BA}, // ϰ κ ( 1) GREEK KAPPA SYMBOL + {0x03F1, 0x03F1, 0x03C1}, // ϱ ρ ( 1) GREEK RHO SYMBOL + {0x03F4, 0x03F4, 0x03B8}, // ϴ θ ( 1) GREEK CAPITAL THETA SYMBOL + {0x03F5, 0x03F5, 0x03B5}, // ϵ ε ( 1) GREEK LUNATE EPSILON SYMBOL + {0x03F7, 0x03F7, 0x03F8}, // Ϸ ϸ ( 1) GREEK CAPITAL LETTER SHO + {0x03F9, 0x03F9, 0x03F2}, // Ϲ ϲ ( 1) GREEK CAPITAL LUNATE SIGMA SYMBOL + {0x03FA, 0x03FA, 0x03FB}, // Ϻ ϻ ( 1) GREEK CAPITAL LETTER SAN + {0x03FD, 0x03FF, 0x037D}, // Ͻ ͻ ( 3) GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL + {0x0400, 0x040F, 0x045F}, // Ѐ ѐ (16) CYRILLIC CAPITAL LETTER IE WITH GRAVE + {0x0410, 0x042F, 0x044F}, // А а (32) CYRILLIC CAPITAL LETTER A + {0x0460, 0x0480, 0x0481}, // Ѡ ѡ (17) CYRILLIC CAPITAL LETTER OMEGA + {0x048A, 0x04BE, 0x04BF}, // Ҋ ҋ (27) CYRILLIC CAPITAL LETTER SHORT I WITH TAIL + {0x04C0, 0x04C0, 0x04CF}, // Ӏ ӏ ( 1) CYRILLIC LETTER 
PALOCHKA + {0x04C1, 0x04CD, 0x04CE}, // Ӂ ӂ ( 7) CYRILLIC CAPITAL LETTER ZHE WITH BREVE + {0x04D0, 0x052E, 0x052F}, // Ӑ ӑ (48) CYRILLIC CAPITAL LETTER A WITH BREVE + {0x0531, 0x0556, 0x0586}, // Ա ա (38) ARMENIAN CAPITAL LETTER AYB + {0x10A0, 0x10C5, 0x2D25}, // Ⴀ ⴀ (38) GEORGIAN CAPITAL LETTER AN + {0x10C7, 0x10C7, 0x2D27}, // Ⴧ ⴧ ( 1) GEORGIAN CAPITAL LETTER YN + {0x10CD, 0x10CD, 0x2D2D}, // Ⴭ ⴭ ( 1) GEORGIAN CAPITAL LETTER AEN + {0x13F8, 0x13FD, 0x13F5}, // ᏸ Ᏸ ( 6) CHEROKEE SMALL LETTER YE + {0x1C80, 0x1C80, 0x0432}, // ᲀ в ( 1) CYRILLIC SMALL LETTER ROUNDED VE + {0x1C81, 0x1C81, 0x0434}, // ᲁ д ( 1) CYRILLIC SMALL LETTER LONG-LEGGED DE + {0x1C82, 0x1C82, 0x043E}, // ᲂ о ( 1) CYRILLIC SMALL LETTER NARROW O + {0x1C83, 0x1C84, 0x0442}, // ᲃ с ( 2) CYRILLIC SMALL LETTER WIDE ES + {0x1C85, 0x1C85, 0x0442}, // ᲅ т ( 1) CYRILLIC SMALL LETTER THREE-LEGGED TE + {0x1C86, 0x1C86, 0x044A}, // ᲆ ъ ( 1) CYRILLIC SMALL LETTER TALL HARD SIGN + {0x1C87, 0x1C87, 0x0463}, // ᲇ ѣ ( 1) CYRILLIC SMALL LETTER TALL YAT + {0x1C88, 0x1C88, 0xA64B}, // ᲈ ꙋ ( 1) CYRILLIC SMALL LETTER UNBLENDED UK + {0x1C90, 0x1CBA, 0x10FA}, // Ა ა (43) GEORGIAN MTAVRULI CAPITAL LETTER AN + {0x1CBD, 0x1CBF, 0x10FF}, // Ჽ ჽ ( 3) GEORGIAN MTAVRULI CAPITAL LETTER AEN + {0x1E00, 0x1E94, 0x1E95}, // Ḁ ḁ (75) LATIN CAPITAL LETTER A WITH RING BELOW + {0x1E9B, 0x1E9B, 0x1E61}, // ẛ ṡ ( 1) LATIN SMALL LETTER LONG S WITH DOT ABOVE + {0x1E9E, 0x1E9E, 0x00DF}, // ẞ ß ( 1) LATIN CAPITAL LETTER SHARP S + {0x1EA0, 0x1EFE, 0x1EFF}, // Ạ ạ (48) LATIN CAPITAL LETTER A WITH DOT BELOW + {0x1F08, 0x1F0F, 0x1F07}, // Ἀ ἀ ( 8) GREEK CAPITAL LETTER ALPHA WITH PSILI + {0x1F18, 0x1F1D, 0x1F15}, // Ἐ ἐ ( 6) GREEK CAPITAL LETTER EPSILON WITH PSILI + {0x1F28, 0x1F2F, 0x1F27}, // Ἠ ἠ ( 8) GREEK CAPITAL LETTER ETA WITH PSILI + {0x1F38, 0x1F3F, 0x1F37}, // Ἰ ἰ ( 8) GREEK CAPITAL LETTER IOTA WITH PSILI + {0x1F48, 0x1F4D, 0x1F45}, // Ὀ ὀ ( 6) GREEK CAPITAL LETTER OMICRON WITH PSILI + {0x1F59, 0x1F5F, 0x1F57}, // Ὑ ὑ ( 7) GREEK CAPITAL 
LETTER UPSILON WITH DASIA + {0x1F68, 0x1F6F, 0x1F67}, // Ὠ ὠ ( 8) GREEK CAPITAL LETTER OMEGA WITH PSILI + {0x1F88, 0x1F8F, 0x1F87}, // ᾈ ᾀ ( 8) GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + {0x1F98, 0x1F9F, 0x1F97}, // ᾘ ᾐ ( 8) GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + {0x1FA8, 0x1FAF, 0x1FA7}, // ᾨ ᾠ ( 8) GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + {0x1FB8, 0x1FB9, 0x1FB1}, // Ᾰ ᾰ ( 2) GREEK CAPITAL LETTER ALPHA WITH VRACHY + {0x1FBA, 0x1FBB, 0x1F71}, // Ὰ ὰ ( 2) GREEK CAPITAL LETTER ALPHA WITH VARIA + {0x1FBC, 0x1FBC, 0x1FB3}, // ᾼ ᾳ ( 1) GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + {0x1FBE, 0x1FBE, 0x03B9}, // ι ι ( 1) GREEK PROSGEGRAMMENI + {0x1FC8, 0x1FCB, 0x1F75}, // Ὲ ὲ ( 4) GREEK CAPITAL LETTER EPSILON WITH VARIA + {0x1FCC, 0x1FCC, 0x1FC3}, // ῌ ῃ ( 1) GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + {0x1FD8, 0x1FD9, 0x1FD1}, // Ῐ ῐ ( 2) GREEK CAPITAL LETTER IOTA WITH VRACHY + {0x1FDA, 0x1FDB, 0x1F77}, // Ὶ ὶ ( 2) GREEK CAPITAL LETTER IOTA WITH VARIA + {0x1FE8, 0x1FE9, 0x1FE1}, // Ῠ ῠ ( 2) GREEK CAPITAL LETTER UPSILON WITH VRACHY + {0x1FEA, 0x1FEB, 0x1F7B}, // Ὺ ὺ ( 2) GREEK CAPITAL LETTER UPSILON WITH VARIA + {0x1FEC, 0x1FEC, 0x1FE5}, // Ῥ ῥ ( 1) GREEK CAPITAL LETTER RHO WITH DASIA + {0x1FF8, 0x1FF9, 0x1F79}, // Ὸ ὸ ( 2) GREEK CAPITAL LETTER OMICRON WITH VARIA + {0x1FFA, 0x1FFB, 0x1F7D}, // Ὼ ὼ ( 2) GREEK CAPITAL LETTER OMEGA WITH VARIA + {0x1FFC, 0x1FFC, 0x1FF3}, // ῼ ῳ ( 1) GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + {0x2126, 0x2126, 0x03C9}, // Ω ω ( 1) OHM SIGN + {0x212A, 0x212A, 0x006B}, // K k ( 1) KELVIN SIGN + {0x212B, 0x212B, 0x00E5}, // Å å ( 1) ANGSTROM SIGN + {0x2132, 0x2132, 0x214E}, // Ⅎ ⅎ ( 1) TURNED CAPITAL F + {0x2160, 0x216F, 0x217F}, // Ⅰ ⅰ (16) ROMAN NUMERAL ONE + {0x2183, 0x2183, 0x2184}, // Ↄ ↄ ( 1) ROMAN NUMERAL REVERSED ONE HUNDRED + {0x24B6, 0x24CF, 0x24E9}, // Ⓐ ⓐ (26) CIRCLED LATIN CAPITAL LETTER A + {0x2C00, 0x2C2F, 0x2C5F}, // Ⰰ ⰰ (48) GLAGOLITIC CAPITAL LETTER AZU + {0x2C60, 
0x2C60, 0x2C61}, // Ⱡ ⱡ ( 1) LATIN CAPITAL LETTER L WITH DOUBLE BAR + {0x2C62, 0x2C62, 0x026B}, // Ɫ ɫ ( 1) LATIN CAPITAL LETTER L WITH MIDDLE TILDE + {0x2C63, 0x2C63, 0x1D7D}, // Ᵽ ᵽ ( 1) LATIN CAPITAL LETTER P WITH STROKE + {0x2C64, 0x2C64, 0x027D}, // Ɽ ɽ ( 1) LATIN CAPITAL LETTER R WITH TAIL + {0x2C67, 0x2C6B, 0x2C6C}, // Ⱨ ⱨ ( 3) LATIN CAPITAL LETTER H WITH DESCENDER + {0x2C6D, 0x2C6D, 0x0251}, // Ɑ ɑ ( 1) LATIN CAPITAL LETTER ALPHA + {0x2C6E, 0x2C6E, 0x0271}, // Ɱ ɱ ( 1) LATIN CAPITAL LETTER M WITH HOOK + {0x2C6F, 0x2C6F, 0x0250}, // Ɐ ɐ ( 1) LATIN CAPITAL LETTER TURNED A + {0x2C70, 0x2C70, 0x0252}, // Ɒ ɒ ( 1) LATIN CAPITAL LETTER TURNED ALPHA + {0x2C72, 0x2C72, 0x2C73}, // Ⱳ ⱳ ( 1) LATIN CAPITAL LETTER W WITH HOOK + {0x2C75, 0x2C75, 0x2C76}, // Ⱶ ⱶ ( 1) LATIN CAPITAL LETTER HALF H + {0x2C7E, 0x2C7F, 0x0240}, // Ȿ ȿ ( 2) LATIN CAPITAL LETTER S WITH SWASH TAIL + {0x2C80, 0x2CE2, 0x2CE3}, // Ⲁ ⲁ (50) COPTIC CAPITAL LETTER ALFA + {0x2CEB, 0x2CED, 0x2CEE}, // Ⳬ ⳬ ( 2) COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI + {0x2CF2, 0x2CF2, 0x2CF3}, // Ⳳ ⳳ ( 1) COPTIC CAPITAL LETTER BOHAIRIC KHEI + {0xA640, 0xA66C, 0xA66D}, // Ꙁ ꙁ (23) CYRILLIC CAPITAL LETTER ZEMLYA + {0xA680, 0xA69A, 0xA69B}, // Ꚁ ꚁ (14) CYRILLIC CAPITAL LETTER DWE + {0xA722, 0xA72E, 0xA72F}, // Ꜣ ꜣ ( 7) LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF + {0xA732, 0xA76E, 0xA76F}, // Ꜳ ꜳ (31) LATIN CAPITAL LETTER AA + {0xA779, 0xA77B, 0xA77C}, // Ꝺ ꝺ ( 2) LATIN CAPITAL LETTER INSULAR D + {0xA77D, 0xA77D, 0x1D79}, // Ᵹ ᵹ ( 1) LATIN CAPITAL LETTER INSULAR G + {0xA77E, 0xA786, 0xA787}, // Ꝿ ꝿ ( 5) LATIN CAPITAL LETTER TURNED INSULAR G + {0xA78B, 0xA78B, 0xA78C}, // Ꞌ ꞌ ( 1) LATIN CAPITAL LETTER SALTILLO + {0xA78D, 0xA78D, 0x0265}, // Ɥ ɥ ( 1) LATIN CAPITAL LETTER TURNED H + {0xA790, 0xA792, 0xA793}, // Ꞑ ꞑ ( 2) LATIN CAPITAL LETTER N WITH DESCENDER + {0xA796, 0xA7A8, 0xA7A9}, // Ꞗ ꞗ (10) LATIN CAPITAL LETTER B WITH FLOURISH + {0xA7AA, 0xA7AA, 0x0266}, // Ɦ ɦ ( 1) LATIN CAPITAL LETTER H WITH HOOK + {0xA7AB, 0xA7AB, 
0x025C}, // Ɜ ɜ ( 1) LATIN CAPITAL LETTER REVERSED OPEN E + {0xA7AC, 0xA7AC, 0x0261}, // Ɡ ɡ ( 1) LATIN CAPITAL LETTER SCRIPT G + {0xA7AD, 0xA7AD, 0x026C}, // Ɬ ɬ ( 1) LATIN CAPITAL LETTER L WITH BELT + {0xA7AE, 0xA7AE, 0x026A}, // Ɪ ɪ ( 1) LATIN CAPITAL LETTER SMALL CAPITAL I + {0xA7B0, 0xA7B0, 0x029E}, // Ʞ ʞ ( 1) LATIN CAPITAL LETTER TURNED K + {0xA7B1, 0xA7B1, 0x0287}, // Ʇ ʇ ( 1) LATIN CAPITAL LETTER TURNED T + {0xA7B2, 0xA7B2, 0x029D}, // Ʝ ʝ ( 1) LATIN CAPITAL LETTER J WITH CROSSED-TAIL + {0xA7B3, 0xA7B3, 0xAB53}, // Ꭓ ꭓ ( 1) LATIN CAPITAL LETTER CHI + {0xA7B4, 0xA7C2, 0xA7C3}, // Ꞵ ꞵ ( 8) LATIN CAPITAL LETTER BETA + {0xA7C4, 0xA7C4, 0xA794}, // Ꞔ ꞔ ( 1) LATIN CAPITAL LETTER C WITH PALATAL HOOK + {0xA7C5, 0xA7C5, 0x0282}, // Ʂ ʂ ( 1) LATIN CAPITAL LETTER S WITH HOOK + {0xA7C6, 0xA7C6, 0x1D8E}, // Ᶎ ᶎ ( 1) LATIN CAPITAL LETTER Z WITH PALATAL HOOK + {0xA7C7, 0xA7C9, 0xA7CA}, // Ꟈ ꟈ ( 2) LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY + {0xA7D0, 0xA7D0, 0xA7D1}, // Ꟑ ꟑ ( 1) LATIN CAPITAL LETTER CLOSED INSULAR G + {0xA7D6, 0xA7D8, 0xA7D9}, // Ꟗ ꟗ ( 2) LATIN CAPITAL LETTER MIDDLE SCOTS S + {0xA7F5, 0xA7F5, 0xA7F6}, // Ꟶ ꟶ ( 1) LATIN CAPITAL LETTER REVERSED HALF H + {0xAB70, 0xABBF, 0x13EF}, // ꭰ Ꭰ (80) CHEROKEE SMALL LETTER A + {0xFF21, 0xFF3A, 0xFF5A}, // A a (26) FULLWIDTH LATIN CAPITAL LETTER A + {0x0130, 0x0130, 0x0069}, // İ i ( 1) LATIN CAPITAL LETTER I WITH DOT ABOVE + {0x01CD, 0x01DB, 0x01DC}, // Ǎ ǎ ( 8) LATIN CAPITAL LETTER A WITH CARON + {0x01F4, 0x01F4, 0x01F5}, // Ǵ ǵ ( 1) LATIN CAPITAL LETTER G WITH ACUTE + {0x13A0, 0x13EF, 0xABBF}, // Ꭰ ꭰ (80) CHEROKEE LETTER A + {0x13F0, 0x13F5, 0x13FD}, // Ᏸ ᏸ ( 6) CHEROKEE LETTER YE + {0x039C, 0x039C, 0x00B5}, // Μ µ ( 1) + {0x0049, 0x0049, 0x0131}, // I ı ( 1) + {0x0053, 0x0053, 0x017F}, // S ſ ( 1) + {0x03A3, 0x03A3, 0x03C2}, // Σ ς ( 1) + {0x0392, 0x0392, 0x03D0}, // Β ϐ ( 1) + {0x0398, 0x0398, 0x03D1}, // Θ ϑ ( 1) + {0x03A6, 0x03A6, 0x03D5}, // Φ ϕ ( 1) + {0x03A0, 0x03A0, 0x03D6}, // Π ϖ ( 1) + {0x039A, 
0x039A, 0x03F0}, // Κ ϰ ( 1) + {0x03A1, 0x03A1, 0x03F1}, // Ρ ϱ ( 1) + {0x0395, 0x0395, 0x03F5}, // Ε ϵ ( 1) + {0x0412, 0x0412, 0x1C80}, // В ᲀ ( 1) + {0x0414, 0x0414, 0x1C81}, // Д ᲁ ( 1) + {0x041E, 0x041E, 0x1C82}, // О ᲂ ( 1) + {0x0421, 0x0422, 0x1C84}, // С ᲃ ( 2) + {0x0422, 0x0422, 0x1C85}, // Т ᲅ ( 1) + {0x042A, 0x042A, 0x1C86}, // Ъ ᲆ ( 1) + {0x0462, 0x0462, 0x1C87}, // Ѣ ᲇ ( 1) + {0xA64A, 0xA64A, 0x1C88}, // Ꙋ ᲈ ( 1) + {0x1E60, 0x1E60, 0x1E9B}, // Ṡ ẛ ( 1) + {0x0399, 0x0399, 0x1FBE}, // Ι ι ( 1) +}; // 218 + +// Only the first 192 entries are used for casefolding. +enum { casefold_len = 192 }; + +// Indexes of uppercase runes in the casemappings table +// ordered by rune value, e.g.: 0x0041 (0), 0x00C0 (2), 0x00D8 (3), ... +static uint8_t upcase_ind[162] = { + 0, 2, 3, 4, 192, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 43, 45, 193, 47, 48, 194, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 80, + 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 195, 196, + 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 125, 126, 129, 131, 132, 133, + 134, 135, 136, 137, 139, 140, 141, 142, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, + 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 191, +}; + +// Indexes of lowercase runes in the casemappings table +// ordered by rune value, e.g.: 0x007A (0), 0x00B5 (197), 0x00DF (113), ... 
+static uint8_t lowcase_ind[184] = { + 0, 197, 113, 2, 3, 8, 4, 198, 5, 6, 7, 9, 199, 60, 12, 14, 16, 20, 50, + 25, 57, 53, 29, 31, 33, 35, 37, 39, 40, 51, 41, 43, 45, 193, 17, 47, 48, 194, + 52, 54, 56, 158, 59, 63, 154, 152, 155, 11, 13, 15, 18, 19, 174, 21, 175, 22, 170, + 173, 24, 23, 177, 148, 176, 26, 153, 27, 28, 150, 30, 184, 32, 179, 34, 61, 36, 62, + 38, 180, 178, 65, 66, 88, 68, 69, 72, 200, 73, 70, 71, 201, 202, 203, 204, 75, 80, + 205, 206, 86, 67, 207, 85, 87, 90, 89, 91, 92, 94, 93, 95, 96, 109, 110, 196, 208, + 209, 210, 211, 212, 213, 214, 215, 167, 149, 185, 111, 216, 114, 115, 116, 117, 118, 119, 120, + 121, 126, 129, 132, 136, 134, 137, 122, 123, 124, 125, 127, 217, 130, 131, 133, 135, 138, 142, + 144, 146, 147, 55, 58, 151, 156, 157, 159, 160, 161, 97, 98, 99, 162, 163, 164, 165, 166, + 168, 169, 171, 183, 172, 182, 186, 187, 188, 189, 181, 195, 191, +}; + +#endif // UTF8_TABLES_H diff --git a/libsql-ffi/bundled/sqlean/text/utf8/utf8.c b/libsql-ffi/bundled/sqlean/text/utf8/utf8.c new file mode 100644 index 0000000000..1e26f51fff --- /dev/null +++ b/libsql-ffi/bundled/sqlean/text/utf8/utf8.c @@ -0,0 +1,152 @@ +/* MIT License + * + * Copyright (c) 2023 Tyge Løvset + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// UTF-8 string handling. + +#include +#include +#include +#include + +#include "text/utf8/rune.h" +#include "text/utf8/utf8.h" + +const uint8_t utf8_dtab[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, + 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, + 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, + 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, +}; + +// Encode/decode functions. + +// utf8_decode decodes a byte as part of a utf8 codepoint. 
+uint32_t utf8_decode(utf8_decode_t* d, const uint32_t byte) {
+    // byte must be in 0..255: it indexes the first 256 entries of utf8_dtab,
+    // which hold the character class of each byte. The entries from 256 on
+    // are the state transitions, indexed by state + class.
+    const uint32_t type = utf8_dtab[byte];
+    d->codep = d->state ? (byte & 0x3fu) | (d->codep << 6) : (0xffU >> type) & byte;
+    // The new state is 0 once a complete codepoint has been decoded.
+    return d->state = utf8_dtab[256 + d->state + type];
+}
+
+// utf8_encode encodes the utf8 codepoint c to s
+// and returns the number of bytes written.
+// out must have room for up to 4 bytes. Returns 0 and writes nothing if c
+// is a surrogate (0xD800..0xDFFF) or lies above 0x10FFFF.
+int utf8_encode(char* out, uint32_t c) {
+    if (c < 0x80U) {
+        out[0] = (char)c;
+        return 1;
+    } else if (c < 0x0800U) {
+        out[0] = (char)((c >> 6 & 0x1F) | 0xC0);
+        out[1] = (char)((c & 0x3F) | 0x80);
+        return 2;
+    } else if (c < 0x010000U) {
+        if ((c < 0xD800U) | (c >= 0xE000U)) {
+            out[0] = (char)((c >> 12 & 0x0F) | 0xE0);
+            out[1] = (char)((c >> 6 & 0x3F) | 0x80);
+            out[2] = (char)((c & 0x3F) | 0x80);
+            return 3;
+        }
+    } else if (c < 0x110000U) {
+        out[0] = (char)((c >> 18 & 0x07) | 0xF0);
+        out[1] = (char)((c >> 12 & 0x3F) | 0x80);
+        out[2] = (char)((c >> 6 & 0x3F) | 0x80);
+        out[3] = (char)((c & 0x3F) | 0x80);
+        return 4;
+    }
+    return 0;
+}
+
+// String functions.
+
+// utf8_at returns a pointer to the utf8 codepoint at index in s.
+// At most n bytes of s are examined and the scan also stops at a NUL byte;
+// if s holds fewer than index codepoints, the returned pointer is where the
+// scan stopped. The byte at s[n] is never read.
+const char* utf8_at(const char* s, size_t n, size_t index) {
+    while (index > 0 && n > 0 && *s != 0) {
+        ++s;
+        --n;
+        // A codepoint ends where the next byte is not a continuation byte
+        // (10xxxxxx). The end of the buffer always ends a codepoint.
+        if (n == 0 || (*s & 0xC0) != 0x80) {
+            --index;
+        }
+    }
+    return s;
+}
+
+// utf8_pos returns the byte position of the utf8 codepoint at index in s.
+size_t utf8_pos(const char* s, size_t n, size_t index) {
+    return (size_t)(utf8_at(s, n, index) - s);
+}
+
+// utf8_len returns the number of utf8 codepoints in s.
+// At most n bytes are examined and counting also stops at a NUL byte.
+// The byte at s[n] is never read; a sequence cut off by the end of the
+// buffer counts as one codepoint.
+size_t utf8_len(const char* s, size_t n) {
+    size_t count = 0;
+    while (n > 0 && *s != 0) {
+        ++s;
+        --n;
+        if (n == 0 || (*s & 0xC0) != 0x80) {
+            ++count;
+        }
+    }
+    return count;
+}
+
+// utf8_peek returns the utf8 codepoint at the start of s.
+// s must be NUL-terminated. Returns U+FFFD (replacement character) if s
+// does not start with a valid utf8 sequence.
+uint32_t utf8_peek(const char* s) {
+    // State the decoder parks in after malformed input. It never leaves it,
+    // so without this check the loop would run past the terminating NUL.
+    const uint32_t reject = 12;
+    utf8_decode_t d = {.state = 0};
+    do {
+        utf8_decode(&d, (uint8_t)*s++);
+    } while (d.state != 0 && d.state != reject);
+    return d.state == 0 ? d.codep : 0xFFFDu;
+}
+
+// utf8_peek_at returns the utf8 codepoint at the index pos from s.
+uint32_t utf8_peek_at(const char* s, size_t n, size_t pos) {
+    return utf8_peek(utf8_at(s, n, pos));
+}
+
+// utf8_icmp_next decodes one codepoint of s starting at byte *pos and
+// advances *pos past the bytes it consumed. The caller guarantees
+// *pos < n on entry; no byte at index n or beyond is read.
+// Returns 0 when the last byte consumed is the NUL terminator, and U+FFFD
+// when the bytes do not form a valid sequence or the sequence is cut
+// short by n.
+static uint32_t utf8_icmp_next(const char* s, size_t n, size_t* pos) {
+    // State 12 is the absorbing reject state of the utf8_dtab transition
+    // rows, so it has to end the loop explicitly.
+    const uint32_t reject = 12;
+    utf8_decode_t d = {.state = 0};
+    uint8_t byte;
+    do {
+        byte = (uint8_t)s[(*pos)++];
+        utf8_decode(&d, byte);
+    } while (d.state != 0 && d.state != reject && *pos < n);
+    if (byte == 0) {
+        return 0;  // terminator reached, possibly in the middle of a sequence
+    }
+    return d.state == 0 ? d.codep : 0xFFFDu;
+}
+
+// utf8_icmp compares the utf8 strings s1 and s2 case-insensitively.
+// Codepoints are compared pairwise after rune_casefold. The result is the
+// difference of the first pair that differs, 0 when both strings end at a
+// NUL at the same time, and otherwise n1 - n2 once either length is used up.
+// Malformed sequences compare as U+FFFD and never cause reads past the
+// terminator or past n1/n2.
+int utf8_icmp(const char* s1, size_t n1, const char* s2, size_t n2) {
+    size_t j1 = 0, j2 = 0;
+    while ((j1 < n1) & (j2 < n2)) {
+        const uint32_t c1 = utf8_icmp_next(s1, n1, &j1);
+        const uint32_t c2 = utf8_icmp_next(s2, n2, &j2);
+        int32_t c = (int32_t)rune_casefold(c1) - (int32_t)rune_casefold(c2);
+        if (c || c2 == 0)  // OK if n1 and n2 are npos
+            return (int)c;
+    }
+    return (int)(n1 - n2);
+}
+
+// utf8_valid returns true if s is a valid utf8 string.
+// At most n bytes are decoded and decoding stops at a NUL byte; the string
+// is valid when the decoder ends in state 0 (no pending or rejected sequence).
+bool utf8_valid(const char* s, size_t n) {
+    utf8_decode_t d = {.state = 0};
+    while ((n-- != 0) & (*s != 0)) {
+        utf8_decode(&d, (uint8_t)*s++);
+    }
+    return d.state == 0;
+}
diff --git a/libsql-ffi/bundled/sqlean/text/utf8/utf8.h b/libsql-ffi/bundled/sqlean/text/utf8/utf8.h
new file mode 100644
index 0000000000..fc2f9d078e
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/text/utf8/utf8.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2024 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// UTF-8 string handling.
+
+#ifndef UTF8_H
+#define UTF8_H
+
+#include
+
+// decode next utf8 codepoint.
+// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+typedef struct {
+    uint32_t state, codep;
+} utf8_decode_t;
+
+// utf8_decode decodes a byte as part of a utf8 codepoint.
+uint32_t utf8_decode(utf8_decode_t* d, const uint32_t byte);
+// utf8_encode encodes the utf8 codepoint c to s
+// and returns the number of bytes written.
+int utf8_encode(char* out, uint32_t c);
+
+// utf8_len returns the number of utf8 codepoints in s.
+size_t utf8_len(const char* s, size_t n);
+
+// utf8_at returns a pointer to the utf8 codepoint at index in s.
+const char* utf8_at(const char* s, size_t n, size_t index);
+// utf8_pos returns the byte position of the utf8 codepoint at index in s.
+size_t utf8_pos(const char* s, size_t n, size_t index);
+
+// utf8_peek returns the utf8 codepoint at the start of s.
+uint32_t utf8_peek(const char* s);
+// utf8_peek_at returns the utf8 codepoint at the index pos from s.
+uint32_t utf8_peek_at(const char* s, size_t n, size_t pos);
+
+// utf8_icmp compares the utf8 strings s1 and s2 case-insensitively.
+int utf8_icmp(const char* s1, size_t n1, const char* s2, size_t n2);
+
+// utf8_valid returns true if s is a valid utf8 string.
+bool utf8_valid(const char* s, size_t n);
+
+// utf8_tolower converts the utf8 string s to lowercase.
+bool utf8_tolower(char* s, size_t n);
+// utf8_toupper converts the utf8 string s to uppercase.
+bool utf8_toupper(char* s, size_t n);
+// utf8_totitle converts the utf8 string s to title-case.
+bool utf8_totitle(char* s, size_t n);
+// utf8_casefold converts the utf8 string s to folded-case.
+bool utf8_casefold(char* s, size_t n);
+
+#endif  // UTF8_H
diff --git a/libsql-ffi/bundled/sqlean/time/duration.c b/libsql-ffi/bundled/sqlean/time/duration.c
new file mode 100644
index 0000000000..802d418fb6
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/time/duration.c
@@ -0,0 +1,117 @@
+// Copyright (c) 2024 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Based on Go's time package, BSD 3-Clause License
+// https://github.com/golang/go
+
+// Duration methods.
+
+#include
+#include
+#include
+#include "time/timex.h"
+
+// Common durations.
+const Duration Nanosecond = 1;
+const Duration Microsecond = 1000 * Nanosecond;
+const Duration Millisecond = 1000 * Microsecond;
+const Duration Second = 1000 * Millisecond;
+const Duration Minute = 60 * Second;
+const Duration Hour = 60 * Minute;
+
+#pragma region Conversion
+
+// dur_to_micro returns the duration as an integer microsecond count.
+int64_t dur_to_micro(Duration d) {
+    const int64_t micros = d / Microsecond;
+    return micros;
+}
+
+// dur_to_milli converts d to whole milliseconds (integer division).
+int64_t dur_to_milli(Duration d) {
+    const int64_t millis = d / Millisecond;
+    return millis;
+}
+
+// dur_to_seconds converts d to seconds as a double: the whole seconds
+// plus the remaining nanoseconds scaled by 1e9.
+double dur_to_seconds(Duration d) {
+    const int64_t whole = d / Second;
+    const int64_t rest = d % Second;
+    return (double)whole + (double)rest / 1e9;
+}
+
+// dur_to_minutes converts d to minutes as a double: the whole minutes
+// plus the remaining nanoseconds as a fraction of a minute.
+double dur_to_minutes(Duration d) {
+    const int64_t whole = d / Minute;
+    const int64_t rest = d % Minute;
+    return (double)whole + (double)rest / (60 * 1e9);
+}
+
+// dur_to_hours converts d to hours as a double: the whole hours
+// plus the remaining nanoseconds as a fraction of an hour.
+double dur_to_hours(Duration d) {
+    const int64_t whole = d / Hour;
+    const int64_t rest = d % Hour;
+    return (double)whole + (double)rest / (60 * 60 * 1e9);
+}
+
+#pragma endregion
+
+#pragma region Rounding
+
+// dless_than_half tells whether 2*x is smaller than y. Both values are
+// expected to be positive; the sum is formed in unsigned arithmetic so
+// that doubling x cannot overflow the signed Duration type.
+static bool dless_than_half(Duration x, Duration y) {
+    const uint64_t doubled = (uint64_t)x + (uint64_t)x;
+    return doubled < (uint64_t)y;
+}
+
+// dur_truncate rounds d toward zero to a multiple of m.
+// A non-positive m leaves d unchanged.
+Duration dur_truncate(Duration d, Duration m) {
+    return m <= 0 ? d : d - d % m;
+}
+
+// dur_round returns the result of rounding d to the nearest multiple of m.
+// The rounding behavior for halfway values is to round away from zero.
+// If the result exceeds the maximum (or minimum)
+// value that can be stored in a Duration,
+// Round returns the maximum (or minimum) duration.
+// If m <= 0, Round returns d unchanged.
+Duration dur_round(Duration d, Duration m) {
+    if (m <= 0) {
+        return d;
+    }
+    // r has the sign of d and |r| < m.
+    int64_t r = d % m;
+
+    if (d < 0) {
+        r = -r;
+        if (dless_than_half(r, m)) {
+            // closer to the multiple nearer to zero
+            return d + r;
+        }
+        // Round away from zero by subtracting step, 0 < step <= m.
+        // The headroom is tested before subtracting: signed overflow is
+        // undefined behaviour in C, so it cannot be detected by inspecting
+        // the result afterwards. d is negative, hence d - INT64_MIN fits.
+        const int64_t step = m - r;
+        if (step > d - INT64_MIN) {
+            return MIN_DURATION;  // overflow
+        }
+        return d - step;
+    }
+
+    if (dless_than_half(r, m)) {
+        return d - r;
+    }
+    // Round away from zero by adding step, 0 < step <= m; d is
+    // non-negative here, hence INT64_MAX - d fits.
+    const int64_t step = m - r;
+    if (step > INT64_MAX - d) {
+        return MAX_DURATION;  // overflow
+    }
+    return d + step;
+}
+
+// dur_abs returns the absolute value of d.
+// As a special case, MIN_DURATION is converted to MAX_DURATION.
+Duration dur_abs(Duration d) {
+    if (d == MIN_DURATION) {
+        return MAX_DURATION;
+    }
+    return d < 0 ? -d : d;
+}
+
+#pragma endregion
diff --git a/libsql-ffi/bundled/sqlean/time/extension.c b/libsql-ffi/bundled/sqlean/time/extension.c
new file mode 100644
index 0000000000..676c042a5b
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/time/extension.c
@@ -0,0 +1,798 @@
+// Copyright (c) 2024 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// SQLite extension for working with time.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT3
+
+#include "time/timex.h"
+
+// result_blob converts a Time value to a blob and sets it as the result.
+// The blob is built in a stack buffer of TIMEX_BLOB_SIZE bytes;
+// SQLITE_TRANSIENT asks SQLite to take its own copy before returning.
+static void result_blob(sqlite3_context* context, Time t) {
+    uint8_t buf[TIMEX_BLOB_SIZE];
+    time_to_blob(t, buf);
+    sqlite3_result_blob(context, buf, sizeof(buf), SQLITE_TRANSIENT);
+}
+
+// time_now()
+// Returns the value of time_now() as a time blob.
+static void fn_now(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 0);
+    Time t = time_now();
+    result_blob(context, t);
+}
+
+// time_date(year, month, day[, hour, min, sec[, nsec[, offset_sec]]])
+// Every argument must be an SQL integer, otherwise an error is raised.
+// Omitted trailing components default to 0.
+static void fn_date(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 3 || argc == 6 || argc == 7 || argc == 8);
+    for (int i = 0; i < argc; i++) {
+        if (sqlite3_value_type(argv[i]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "all parameters should be integers", -1);
+            return;
+        }
+    }
+    int year = sqlite3_value_int(argv[0]);
+    int month = sqlite3_value_int(argv[1]);
+    int day = sqlite3_value_int(argv[2]);
+
+    int hour = 0;
+    int min = 0;
+    int sec = 0;
+    if (argc >= 6) {
+        hour = sqlite3_value_int(argv[3]);
+        min = sqlite3_value_int(argv[4]);
+        sec = sqlite3_value_int(argv[5]);
+    }
+
+    int nsec = 0;
+    if (argc >= 7) {
+        nsec = sqlite3_value_int(argv[6]);
+    }
+
+    int offset_sec = 0;
+    if (argc == 8) {
+        offset_sec = sqlite3_value_int(argv[7]);
+    }
+
+    Time t = time_date(year, month, day, hour, min, sec, nsec, offset_sec);
+    result_blob(context, t);
+}
+
+// time_get_year(t)
+// time_get_month(t)
+// time_get_day(t)
+// time_get_hour(t)
+// time_get_minute(t)
+// time_get_second(t)
+// time_get_nano(t)
+// time_get_weekday(t)
+// time_get_yearday(t)
+// Shared implementation: the accessor to call is the function pointer that
+// time_init registered as user data for the SQL function being invoked.
+// The argument must be a blob of exactly TIMEX_BLOB_SIZE bytes.
+static void fn_extract(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    int (*extract)(Time t) = (int (*)(Time t))sqlite3_user_data(context);
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+    sqlite3_result_int(context, extract(t));
+}
+
+// time_get_isoyear(t)
+// Returns the year component reported by time_get_isoweek.
+static void fn_get_isoyear(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+    int year, week;
+    time_get_isoweek(t, &year, &week);
+    sqlite3_result_int(context, year);
+}
+
+// time_get_isoweek(t)
+// Returns the week component reported by time_get_isoweek.
+static void fn_get_isoweek(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+    int year, week;
+    time_get_isoweek(t, &year, &week);
+    sqlite3_result_int(context, week);
+}
+
+// get_field returns a part of the t according to a given field
+// The part is set as the SQL result; an unrecognized field name produces
+// the error "unknown field". Names tested with strncmp are prefix matches,
+// so any name starting with that prefix is accepted (e.g. "months").
+// Names tested with strcmp must match exactly. "second" and "epoch" yield
+// doubles, all other fields yield integers.
+static void get_field(sqlite3_context* context, Time t, const char* field) {
+    // millennium, century, decade
+    // plain integer division of the year
+    if (strcmp(field, "millennium") == 0) {
+        int millennium = time_get_year(t) / 1000;
+        sqlite3_result_int(context, millennium);
+        return;
+    }
+    if (strcmp(field, "century") == 0) {
+        int century = time_get_year(t) / 100;
+        sqlite3_result_int(context, century);
+        return;
+    }
+    if (strncmp(field, "decade", 6) == 0) {
+        int decade = time_get_year(t) / 10;
+        sqlite3_result_int(context, decade);
+        return;
+    }
+
+    // year, quarter, month, day
+    if (strcmp(field, "year") == 0 || strcmp(field, "years") == 0) {
+        sqlite3_result_int(context, time_get_year(t));
+        return;
+    }
+    if (strncmp(field, "quarter", 7) == 0) {
+        // months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
+        int quarter = (time_get_month(t) - 1) / 3 + 1;
+        sqlite3_result_int(context, quarter);
+        return;
+    }
+    if (strncmp(field, "month", 5) == 0) {
+        sqlite3_result_int(context, time_get_month(t));
+        return;
+    }
+    if (strcmp(field, "day") == 0 || strcmp(field, "days") == 0) {
+        sqlite3_result_int(context, time_get_day(t));
+        return;
+    }
+
+    // hour, minute, second
+    if (strncmp(field, "hour", 4) == 0) {
+        sqlite3_result_int(context, time_get_hour(t));
+        return;
+    }
+    if (strncmp(field, "minute", 6) == 0) {
+        sqlite3_result_int(context, time_get_minute(t));
+        return;
+    }
+    if (strncmp(field, "second", 6) == 0) {
+        // including fractional part
+        double sec = time_get_second(t) + t.nsec / 1e9;
+        sqlite3_result_double(context, sec);
+        return;
+    }
+
+    // millisecond, microsecond, nanosecond
+    // all three are derived from time_get_nano(t)
+    if (strncmp(field, "milli", 5) == 0) {
+        int msec = time_get_nano(t) / 1000000;
+        sqlite3_result_int(context, msec);
+        return;
+    }
+    if (strncmp(field, "micro", 5) == 0) {
+        int usec = time_get_nano(t) / 1000;
+        sqlite3_result_int(context, usec);
+        return;
+    }
+    if (strncmp(field, "nano", 4) == 0) {
+        sqlite3_result_int(context, time_get_nano(t));
+        return;
+    }
+
+    // isoyear, isoweek, isodow, yearday, weekday
+    if (strcmp(field, "isoyear") == 0) {
+        int year, week;
+        time_get_isoweek(t, &year, &week);
+        sqlite3_result_int(context, year);
+        return;
+    }
+    if (strcmp(field, "isoweek") == 0 || strcmp(field, "week") == 0) {
+        int year, week;
+        time_get_isoweek(t, &year, &week);
+        sqlite3_result_int(context, week);
+        return;
+    }
+    if (strcmp(field, "isodow") == 0) {
+        // weekday 0 is reported as 7, other weekdays are passed through
+        int isodow = time_get_weekday(t) == 0 ? 7 : time_get_weekday(t);
+        sqlite3_result_int(context, isodow);
+        return;
+    }
+    if (strcmp(field, "yearday") == 0 || strcmp(field, "doy") == 0 ||
+        strcmp(field, "dayofyear") == 0) {
+        sqlite3_result_int(context, time_get_yearday(t));
+        return;
+    }
+    if (strcmp(field, "weekday") == 0 || strcmp(field, "dow") == 0 ||
+        strcmp(field, "dayofweek") == 0) {
+        sqlite3_result_int(context, time_get_weekday(t));
+        return;
+    }
+
+    // epoch
+    if (strcmp(field, "epoch") == 0) {
+        // including fractional part
+        double epoch = time_to_unix(t) + t.nsec / 1e9;
+        sqlite3_result_double(context, epoch);
+        return;
+    }
+
+    sqlite3_result_error(context, "unknown field", -1);
+}
+
+// time_get(t, field)
+// Validates that t is a time blob of the expected size and that field is
+// an SQL text value, then delegates to get_field.
+static void fn_get(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_TEXT) {
+        sqlite3_result_error(context, "2nd parameter: should be a field name", -1);
+        return;
+    }
+    const char* field = (const char*)sqlite3_value_text(argv[1]);
+
+    get_field(context, t, field);
+}
+
+// date_part(field, t)
+// Postgres-compatible.
+// Same as time_get with the two arguments swapped: the field name comes
+// first, the time blob second. Delegates to get_field.
+static void date_part(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    if (sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
+        sqlite3_result_error(context, "1st parameter: should be a field name", -1);
+        return;
+    }
+    const char* field = (const char*)sqlite3_value_text(argv[0]);
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "2nd parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[1]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "2nd parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[1]));
+
+    get_field(context, t, field);
+}
+
+// time_unix(sec[, nsec])
+// Both arguments must be SQL integers; nsec defaults to 0.
+static void fn_unix(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1 || argc == 2);
+    for (int i = 0; i < argc; i++) {
+        if (sqlite3_value_type(argv[i]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "all parameters should be integers", -1);
+            return;
+        }
+    }
+
+    int64_t sec = sqlite3_value_int64(argv[0]);
+    int64_t nsec = 0;
+    if (argc == 2) {
+        nsec = sqlite3_value_int64(argv[1]);
+    }
+
+    Time t = time_unix(sec, nsec);
+    result_blob(context, t);
+}
+
+// time_milli(msec)
+// time_micro(usec)
+// time_nano(nsec)
+// Shared implementation: the constructor to call is the function pointer
+// that time_init registered as user data.
+static void fn_unix_n(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_INTEGER) {
+        sqlite3_result_error(context, "parameter should be an integer", -1);
+        return;
+    }
+    int64_t n = sqlite3_value_int64(argv[0]);
+    Time (*convert)(int64_t n) = (Time(*)(int64_t))sqlite3_user_data(context);
+    Time t = convert(n);
+    result_blob(context, t);
+}
+
+// time_to_unix(t)
+// time_to_milli(t)
+// time_to_micro(t)
+// time_to_nano(t)
+// Shared implementation: the converter is taken from the user data and its
+// int64 result is returned as the SQL result.
+static void fn_convert(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    int64_t (*convert)(Time t) = (int64_t(*)(Time t))sqlite3_user_data(context);
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+    sqlite3_result_int64(context, convert(t));
+}
+
+// time_after(t, u)
+// time_before(t, u)
+// time_compare(t, u)
+// time_equal(t, u)
+// Shared implementation: both arguments must be time blobs; the comparison
+// function is taken from the user data and its int result is returned.
+static void fn_compare(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "2nd parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[1]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "2nd parameter: invalid time blob size", -1);
+        return;
+    }
+    Time u = time_blob(sqlite3_value_blob(argv[1]));
+
+    int (*compare)(Time t, Time u) = (int (*)(Time, Time))sqlite3_user_data(context);
+    sqlite3_result_int(context, compare(t, u));
+}
+
+// time_add(t, d)
+// t is a time blob, d an integer Duration; the result is a time blob.
+static void fn_add(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) {
+        sqlite3_result_error(context, "2nd parameter: should be an integer", -1);
+        return;
+    }
+    Duration d = sqlite3_value_int64(argv[1]);
+
+    Time r = time_add(t, d);
+    result_blob(context, r);
+}
+
+// time_sub(t, u)
+// Both arguments are time blobs; the result is the integer Duration
+// returned by time_sub(t, u).
+static void fn_sub(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "2nd parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[1]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "2nd parameter: invalid time blob size", -1);
+        return;
+    }
+    Time u = time_blob(sqlite3_value_blob(argv[1]));
+
+    Duration d = time_sub(t, u);
+    sqlite3_result_int64(context, d);
+}
+
+// time_since(t)
+// Returns the integer Duration produced by time_since(t).
+static void fn_since(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    Duration d = time_since(t);
+    sqlite3_result_int64(context, d);
+}
+
+// time_until(t)
+// Returns the integer Duration produced by time_until(t).
+static void fn_until(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "parameter should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    Duration d = time_until(t);
+    sqlite3_result_int64(context, d);
+}
+
+// time_add_date(t, years[, months[, days]])
+// years, months and days must be SQL integers; months and days default to 0.
+static void fn_add_date(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2 || argc == 3 || argc == 4);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) {
+        sqlite3_result_error(context, "2nd parameter: should be an integer", -1);
+        return;
+    }
+    int years = sqlite3_value_int(argv[1]);
+
+    int months = 0;
+    if (argc >= 3) {
+        if (sqlite3_value_type(argv[2]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "3rd parameter: should be an integer", -1);
+            return;
+        }
+        months = sqlite3_value_int(argv[2]);
+    }
+
+    int days = 0;
+    if (argc == 4) {
+        if (sqlite3_value_type(argv[3]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "4th parameter: should be an integer", -1);
+            return;
+        }
+        days = sqlite3_value_int(argv[3]);
+    }
+
+    Time r = time_add_date(t, years, months, days);
+    result_blob(context, r);
+}
+
+// trunc_field truncates t according to a given field
+// The truncated time is set as a blob result; an unrecognized field name
+// produces the error "unknown field". Calendar fields rebuild the value
+// with time_date, sub-day fields use time_truncate or recompute the
+// nanosecond part. Only "milli" and "micro" are prefix matches here; all
+// other names must match exactly.
+static void trunc_field(sqlite3_context* context, Time t, const char* field) {
+    // millennium, century, decade
+    if (strcmp(field, "millennium") == 0) {
+        int year = time_get_year(t);
+        int millennium = year / 1000 * 1000;
+        Time r = time_date(millennium, January, 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "century") == 0) {
+        int year = time_get_year(t);
+        int century = year / 100 * 100;
+        Time r = time_date(century, January, 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "decade") == 0) {
+        int year = time_get_year(t);
+        int decade = year / 10 * 10;
+        Time r = time_date(decade, January, 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+
+    // year, quarter, month, week, day
+    if (strcmp(field, "year") == 0) {
+        Time r = time_date(time_get_year(t), January, 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "quarter") == 0) {
+        // zero-based quarter; quarter * 3 + 1 is its first month
+        int quarter = (time_get_month(t) - 1) / 3;
+        Time r = time_date(time_get_year(t), quarter * 3 + 1, 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "month") == 0) {
+        Time r = time_date(time_get_year(t), time_get_month(t), 1, 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "week") == 0) {
+        // January 1 of the ISO year plus (week - 1) * 7 days.
+        // NOTE(review): this is not necessarily the first day of the ISO
+        // week - confirm that this is the intended result.
+        int year, week;
+        time_get_isoweek(t, &year, &week);
+        Time r = time_date(year, January, 1, 0, 0, 0, 0, 0);
+        r = time_add_date(r, 0, 0, (week - 1) * 7);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "day") == 0) {
+        Time r =
+            time_date(time_get_year(t), time_get_month(t), time_get_day(t), 0, 0, 0, 0, TIMEX_UTC);
+        result_blob(context, r);
+        return;
+    }
+
+    // hour, minute, second, millisecond, microsecond
+    if (strcmp(field, "hour") == 0) {
+        Time r = time_truncate(t, Hour);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "minute") == 0) {
+        Time r = time_truncate(t, Minute);
+        result_blob(context, r);
+        return;
+    }
+    if (strcmp(field, "second") == 0) {
+        Time r = time_truncate(t, Second);
+        result_blob(context, r);
+        return;
+    }
+    if (strncmp(field, "milli", 5) == 0) {
+        // keep the seconds, drop everything below a millisecond
+        int64_t nsec = (t.nsec / 1000000) * 1000000;
+        Time r = time_unix(time_to_unix(t), nsec);
+        result_blob(context, r);
+        return;
+    }
+    if (strncmp(field, "micro", 5) == 0) {
+        // keep the seconds, drop everything below a microsecond
+        int64_t nsec = (t.nsec / 1000) * 1000;
+        Time r = time_unix(time_to_unix(t), nsec);
+        result_blob(context, r);
+        return;
+    }
+
+    sqlite3_result_error(context, "unknown field", -1);
+}
+
+// time_trunc(t, field)
+// time_trunc(t, d)
+// The second argument selects the mode: an SQL integer is treated as a
+// Duration and passed to time_truncate, a text value is treated as a field
+// name and passed to trunc_field.
+static void fn_trunc(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    // first parameter is a time blob
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    // second parameter can be a custom duration
+    if (sqlite3_value_type(argv[1]) == SQLITE_INTEGER) {
+        // truncate to custom duration
+        Duration d = sqlite3_value_int64(argv[1]);
+        Time r = time_truncate(t, d);
+        result_blob(context, r);
+        return;
+    }
+
+    // or a field name
+    if (sqlite3_value_type(argv[1]) != SQLITE_TEXT) {
+        sqlite3_result_error(context, "2nd parameter: should be a field name", -1);
+        return;
+    }
+    const char* field = (const char*)sqlite3_value_text(argv[1]);
+
+    // truncate to field
+    trunc_field(context, t, field);
+}
+
+// date_trunc(field, t)
+// Postgres-compatible.
+// Field-name form of time_trunc with the arguments swapped; there is no
+// duration form.
+static void date_trunc(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    // first parameter is a field name
+    if (sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
+        sqlite3_result_error(context, "1st parameter: should be a field name", -1);
+        return;
+    }
+    const char* field = (const char*)sqlite3_value_text(argv[0]);
+
+    // second parameter is a time blob
+    if (sqlite3_value_type(argv[1]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "2nd parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[1]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "2nd parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[1]));
+
+    trunc_field(context, t, field);
+}
+
+// time_round(t, d)
+// t is a time blob, d an integer Duration; the result is a time blob.
+static void fn_round(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 2);
+
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) {
+        sqlite3_result_error(context, "2nd parameter: should be an integer", -1);
+        return;
+    }
+    Duration d = sqlite3_value_int64(argv[1]);
+
+    Time r = time_round(t, d);
+    result_blob(context, r);
+}
+
+// time_fmt_iso(t[, offset_sec])
+// time_fmt_datetime(t[, offset_sec])
+// time_fmt_date(t[, offset_sec])
+// time_fmt_time(t[, offset_sec])
+// Shared implementation: the formatter is taken from the user data and
+// writes into a 36-byte stack buffer; offset_sec defaults to 0.
+// The formatter's return value is ignored and the text length is passed as
+// -1, so this relies on the formatter leaving a NUL-terminated string in
+// buf - presumably guaranteed by the time_fmt_* functions; verify there.
+static void fn_format(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1 || argc == 2);
+    if (sqlite3_value_type(argv[0]) != SQLITE_BLOB) {
+        sqlite3_result_error(context, "1st parameter: should be a time blob", -1);
+        return;
+    }
+    if (sqlite3_value_bytes(argv[0]) != TIMEX_BLOB_SIZE) {
+        sqlite3_result_error(context, "1st parameter: invalid time blob size", -1);
+        return;
+    }
+    Time t = time_blob(sqlite3_value_blob(argv[0]));
+
+    int offset_sec = 0;
+    if (argc == 2) {
+        if (sqlite3_value_type(argv[1]) != SQLITE_INTEGER) {
+            sqlite3_result_error(context, "2nd parameter: should be an integer", -1);
+            return;
+        }
+        offset_sec = sqlite3_value_int(argv[1]);
+    }
+
+    char buf[36];
+    size_t (*format)(char* buf, size_t size, Time t, int offset_sec) =
+        (size_t(*)(char*, size_t, Time, int))sqlite3_user_data(context);
+    format(buf, sizeof(buf), t, offset_sec);
+    sqlite3_result_text(context, buf, -1, SQLITE_TRANSIENT);
+}
+
+// time_parse(v)
+// NOTE(review): unlike the other functions the argument type is not
+// checked. sqlite3_value_text() returns NULL for an SQL NULL, so val may
+// be NULL here; whether time_parse tolerates that is not visible from this
+// file - confirm.
+static void fn_parse(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 1);
+    const char* val = (const char*)sqlite3_value_text(argv[0]);
+    Time t = time_parse(val);
+    result_blob(context, t);
+}
+
+// dur_h(), dur_m(), dur_s(), dur_ms(), dur_us(), dur_ns()
+// The duration value itself is smuggled through the user-data pointer
+// (see the (void*) casts in time_init) and converted back here.
+// NOTE(review): this only round-trips if a pointer can hold the value;
+// Hour is 3.6e12, which does not fit in a 32-bit pointer.
+static void fn_dur_const(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    assert(argc == 0);
+    int64_t d = (intptr_t)sqlite3_user_data(context);
+    sqlite3_result_int64(context, d);
+}
+
+// time_init registers all time functions on the connection db.
+// Functions that read the current time (time_now, now, time_since,
+// time_until) are registered without SQLITE_DETERMINISTIC; everything else
+// uses the deterministic flag set. Optional arguments are implemented by
+// registering the same callback once per accepted argument count.
+// The return codes of sqlite3_create_function are not checked: the
+// function always returns SQLITE_OK.
+int time_init(sqlite3* db) {
+    static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC;
+    static const int flags_nd = SQLITE_UTF8 | SQLITE_INNOCUOUS;
+
+    // constructors
+    sqlite3_create_function(db, "time_now", 0, flags_nd, 0, fn_now, 0, 0);
+    sqlite3_create_function(db, "time_date", 3, flags, 0, fn_date, 0, 0);
+    sqlite3_create_function(db, "time_date", 6, flags, 0, fn_date, 0, 0);
+    sqlite3_create_function(db, "time_date", 7, flags, 0, fn_date, 0, 0);
+    sqlite3_create_function(db, "time_date", 8, flags, 0, fn_date, 0, 0);
+
+    // time parts
+    // the accessor is passed as user data and called by fn_extract
+    sqlite3_create_function(db, "time_get_year", 1, flags, time_get_year, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_month", 1, flags, time_get_month, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_day", 1, flags, time_get_day, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_hour", 1, flags, time_get_hour, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_minute", 1, flags, time_get_minute, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_second", 1, flags, time_get_second, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_nano", 1, flags, time_get_nano, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_weekday", 1, flags, time_get_weekday, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_yearday", 1, flags, time_get_yearday, fn_extract, 0, 0);
+    sqlite3_create_function(db, "time_get_isoyear", 1, flags, 0, fn_get_isoyear, 0, 0);
+    sqlite3_create_function(db, "time_get_isoweek", 1, flags, 0, fn_get_isoweek, 0, 0);
+    sqlite3_create_function(db, "time_get", 2, flags, 0, fn_get, 0, 0);
+
+    // unix time
+    sqlite3_create_function(db, "time_unix", 1, flags, 0, fn_unix, 0, 0);
+    sqlite3_create_function(db, "time_unix", 2, flags, 0, fn_unix, 0, 0);
+    sqlite3_create_function(db, "time_milli", 1, flags, time_milli, fn_unix_n, 0, 0);
+    sqlite3_create_function(db, "time_micro", 1, flags, time_micro, fn_unix_n, 0, 0);
+    sqlite3_create_function(db, "time_nano", 1, flags, time_nano, fn_unix_n, 0, 0);
+    sqlite3_create_function(db, "time_to_unix", 1, flags, time_to_unix, fn_convert, 0, 0);
+    sqlite3_create_function(db, "time_to_milli", 1, flags, time_to_milli, fn_convert, 0, 0);
+    sqlite3_create_function(db, "time_to_micro", 1, flags, time_to_micro, fn_convert, 0, 0);
+    sqlite3_create_function(db, "time_to_nano", 1, flags, time_to_nano, fn_convert, 0, 0);
+
+    // comparison
+    sqlite3_create_function(db, "time_after", 2, flags, time_after, fn_compare, 0, 0);
+    sqlite3_create_function(db, "time_before", 2, flags, time_before, fn_compare, 0, 0);
+    sqlite3_create_function(db, "time_compare", 2, flags, time_compare, fn_compare, 0, 0);
+    sqlite3_create_function(db, "time_equal", 2, flags, time_equal, fn_compare, 0, 0);
+
+    // arithmetic
+    sqlite3_create_function(db, "time_add", 2, flags, 0, fn_add, 0, 0);
+    sqlite3_create_function(db, "time_sub", 2, flags, 0, fn_sub, 0, 0);
+    sqlite3_create_function(db, "time_since", 1, flags_nd, 0, fn_since, 0, 0);
+    sqlite3_create_function(db, "time_until", 1, flags_nd, 0, fn_until, 0, 0);
+    sqlite3_create_function(db, "time_add_date", 2, flags, 0, fn_add_date, 0, 0);
+    sqlite3_create_function(db, "time_add_date", 3, flags, 0, fn_add_date, 0, 0);
+    sqlite3_create_function(db, "time_add_date", 4, flags, 0, fn_add_date, 0, 0);
+
+    // rounding
+    sqlite3_create_function(db, "time_trunc", 2, flags, 0, fn_trunc, 0, 0);
+    sqlite3_create_function(db, "time_round", 2, flags, 0, fn_round, 0, 0);
+
+    // formatting
+    sqlite3_create_function(db, "time_fmt_iso", 1, flags, time_fmt_iso, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_iso", 2, flags, time_fmt_iso, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_datetime", 1, flags, time_fmt_datetime, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_datetime", 2, flags, time_fmt_datetime, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_date", 1, flags, time_fmt_date, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_date", 2, flags, time_fmt_date, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_time", 1, flags, time_fmt_time, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_fmt_time", 2, flags, time_fmt_time, fn_format, 0, 0);
+    sqlite3_create_function(db, "time_parse", 1, flags, 0, fn_parse, 0, 0);
+
+    // duration constants
+    // the value is carried in the user-data pointer, see fn_dur_const
+    sqlite3_create_function(db, "dur_h", 0, flags, (void*)Hour, fn_dur_const, 0, 0);
+    sqlite3_create_function(db, "dur_m", 0, flags, (void*)Minute, fn_dur_const, 0, 0);
+    sqlite3_create_function(db, "dur_s", 0, flags, (void*)Second, fn_dur_const, 0, 0);
+    sqlite3_create_function(db, "dur_ms", 0, flags, (void*)Millisecond, fn_dur_const, 0, 0);
+    sqlite3_create_function(db, "dur_us", 0, flags, (void*)Microsecond, fn_dur_const, 0, 0);
+    sqlite3_create_function(db, "dur_ns", 0, flags, (void*)Nanosecond, fn_dur_const, 0, 0);
+
+    // postgres compatibility layer
+    // aliases that reuse the callbacks registered above
+    sqlite3_create_function(db, "age", 2, flags, 0, fn_sub, 0, 0);
+    sqlite3_create_function(db, "date_add", 2, flags, 0, fn_add, 0, 0);
+    sqlite3_create_function(db, "date_part", 2, flags, 0, date_part, 0, 0);
+    sqlite3_create_function(db, "date_trunc", 2, flags, 0, date_trunc, 0, 0);
+    sqlite3_create_function(db, "make_date", 3, flags, 0, fn_date, 0, 0);
+    sqlite3_create_function(db, "make_timestamp", 6, flags, 0, fn_date, 0, 0);
+    sqlite3_create_function(db, "now", 0, flags_nd, 0, fn_now, 0, 0);
+    sqlite3_create_function(db, "to_timestamp", 1, flags, 0, fn_unix, 0, 0);
+
+    return SQLITE_OK;
+}
diff --git a/libsql-ffi/bundled/sqlean/time/extension.h b/libsql-ffi/bundled/sqlean/time/extension.h
new file mode 100644
index 0000000000..53de529f54
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/time/extension.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2024 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// SQLite extension for working with time.
+ +#ifndef TIME_EXTENSION_H +#define TIME_EXTENSION_H + +#include "sqlite3ext.h" + +int time_init(sqlite3* db); + +#endif /* TIME_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/time/time.c b/libsql-ffi/bundled/sqlean/time/time.c new file mode 100644 index 0000000000..4dce4b2885 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/time/time.c @@ -0,0 +1,875 @@ +// Copyright (c) 2024 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Based on Go's time package, BSD 3-Clause License +// https://github.com/golang/go + +// Time functions and methods. + +#include +#include +#include +#include +#include +#include "time/timex.h" + +#pragma region Private + +static const int64_t seconds_per_minute = 60; +static const int64_t seconds_per_hour = 60 * seconds_per_minute; +static const int64_t seconds_per_day = 24 * seconds_per_hour; +static const int64_t seconds_per_week = 7 * seconds_per_day; +static const int64_t days_per_400_years = 365 * 400 + 97; +static const int64_t days_per_100_years = 365 * 100 + 24; +static const int64_t days_per_4_years = 365 * 4 + 1; + +// The unsigned zero year for internal calculations. +// Must be 1 mod 400, and times before it will not compute correctly, +// but otherwise can be changed at will. +static const int64_t absolute_zero_year = -292277022399LL; + +// Offsets to convert between internal and absolute or Unix times. +// = (absoluteZeroYear - internalYear) * 365.2425 * secondsPerDay +static const int64_t absolute_to_internal = -9223371966579724800LL; +static const int64_t internal_to_absolute = -absolute_to_internal; + +static const int64_t unix_to_internal = + (1969 * 365 + 1969 / 4 - 1969 / 100 + 1969 / 400) * seconds_per_day; +static const int64_t internal_to_unix = -unix_to_internal; + +// days_before[m] counts the number of days in a non-leap year +// before month m begins. There is an entry for m=12, counting +// the number of days before January of next year (365). 
+static const int days_before[] = { + 0, + 31, + 31 + 28, + 31 + 28 + 31, + 31 + 28 + 31 + 30, + 31 + 28 + 31 + 30 + 31, + 31 + 28 + 31 + 30 + 31 + 30, + 31 + 28 + 31 + 30 + 31 + 30 + 31, + 31 + 28 + 31 + 30 + 31 + 30 + 31 + 31, + 31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30, + 31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31, + 31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30, + 31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30 + 31, +}; + +// norm returns nhi, nlo such that +// +// hi * base + lo == nhi * base + nlo +// 0 <= nlo < base +static void norm(int hi, int lo, int base, int* nhi, int* nlo) { + if (lo < 0) { + int n = (-lo - 1) / base + 1; + hi -= n; + lo += n * base; + } + if (lo >= base) { + int n = lo / base; + hi += n; + lo -= n * base; + } + *nhi = hi; + *nlo = lo; +} + +// days_since_epoch takes a year and returns the number of days from +// the absolute epoch to the start of that year. +// This is basically (year - zeroYear) * 365, but accounting for leap days. +static uint64_t days_since_epoch(int year) { + uint64_t y = year - absolute_zero_year; + + // Add in days from 400-year cycles. + uint64_t n = y / 400; + y -= 400 * n; + uint64_t d = days_per_400_years * n; + + // Add in 100-year cycles. + n = y / 100; + y -= 100 * n; + d += days_per_100_years * n; + + // Add in 4-year cycles. + n = y / 4; + y -= 4 * n; + d += days_per_4_years * n; + + // Add in non-leap years. + n = y; + d += 365 * n; + + return d; +} + +// is_leap reports whether the year is a leap year. +static bool is_leap(int year) { + return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0); +} + +static int64_t unix_sec(Time t) { + return t.sec + internal_to_unix; +} + +static Time unix_time(int64_t sec, int32_t nsec) { + return (Time){sec + unix_to_internal, nsec}; +} + +// abs_time returns the time t as an absolute time, adjusted by the zone offset. +// It is called when computing a presentation property like Month or Hour. 
+static uint64_t abs_time(Time t) { + return t.sec + internal_to_absolute; +} + +// abs_weekday is like Weekday but operates on an absolute time. +static enum Weekday abs_weekday(uint64_t abs) { + // January 1 of the absolute year, like January 1 of 2001, was a Monday. + uint64_t sec = (abs + Monday * seconds_per_day) % seconds_per_week; + return sec / seconds_per_day; +} + +static void abs_date(uint64_t abs, int* year, int* yday) { + // Split into time and day. + uint64_t d = abs / seconds_per_day; + + // Account for 400 year cycles. + uint64_t n = d / days_per_400_years; + uint64_t y = 400 * n; + d -= days_per_400_years * n; + + // Cut off 100-year cycles. + // The last cycle has one extra leap year, so on the last day + // of that year, day / days_per_100_years will be 4 instead of 3. + // Cut it back down to 3 by subtracting n>>2. + n = d / days_per_100_years; + n -= n >> 2; + y += 100 * n; + d -= days_per_100_years * n; + + // Cut off 4-year cycles. + // The last cycle has a missing leap year, which does not + // affect the computation. + n = d / days_per_4_years; + y += 4 * n; + d -= days_per_4_years * n; + + // Cut off years within a 4-year cycle. + // The last year is a leap year, so on the last day of that year, + // day / 365 will be 4 instead of 3. Cut it back down to 3 + // by subtracting n>>2. + n = d / 365; + n -= n >> 2; + y += n; + d -= 365 * n; + + *year = y + absolute_zero_year; + *yday = d; +} + +static void abs_date_full(uint64_t abs, int* year, enum Month* month, int* day, int* yday) { + abs_date(abs, year, yday); + + *day = *yday; + if (is_leap(*year)) { + // Leap year + if (*day > 31 + 29 - 1) { + // After leap day; pretend it wasn't there. + *day -= 1; + } + if (*day == 31 + 29 - 1) { + // Leap day. + *month = February; + *day = 29; + return; + } + } + + // Estimate month on assumption that every month has 31 days. + // The estimate may be too low by at most one month, so adjust. 
+ *month = *day / 31; + int end = days_before[(int)(*month) + 1]; + int begin; + if (*day >= end) { + *month += 1; + begin = end; + } else { + begin = days_before[(int)(*month)]; + } + + *month += 1; // because January is 1 + *day = *day - begin + 1; +} + +void abs_clock(uint64_t abs, int* hour, int* min, int* sec) { + *sec = abs % seconds_per_day; + *hour = *sec / seconds_per_hour; + *sec -= *hour * seconds_per_hour; + *min = *sec / seconds_per_minute; + *sec -= *min * seconds_per_minute; +} + +// tless_than_half reports whether x+x < y but avoids overflow, +// assuming x and y are both positive (Duration is signed). +static bool tless_than_half(Duration x, Duration y) { + return (uint64_t)x + (uint64_t)x < (uint64_t)y; +} + +// time_div divides t by d and returns the remainder. +// Only supports d which is a multiple of 1 second. +static Duration time_div(Time t, Duration d) { + if (d % Second != 0) { + return 0; + } + + bool neg = false; + int64_t sec = t.sec; + int64_t nsec = t.nsec; + if (sec < 0) { + // Operate on absolute value. + neg = true; + sec = -sec; + nsec = -nsec; + if (nsec < 0) { + nsec += 1e9; + sec--; // sec >= 1 before the -- so safe + } + } + + // d is a multiple of 1 second. + int64_t d1 = d / Second; + Duration r = (sec % d1) * Second + nsec; + + if (neg && r != 0) { + r = d - r; + } + return r; +} + +#pragma endregion + +#pragma region Constructors + +// time_now returns the current time in UTC. +Time time_now(void) { + struct timespec ts; + timespec_get(&ts, TIME_UTC); + return unix_time(ts.tv_sec, ts.tv_nsec); +} + +// time_date returns the Time corresponding to +// yyyy-mm-dd hh:mm:ss + nsec nanoseconds +// +// The month, day, hour, min, sec, and nsec values may be outside +// their usual ranges and will be normalized during the conversion. +// For example, October 32 converts to November 1. +// +// The time is converted to UTC using offset_sec in seconds east of UTC. 
+Time time_date(int year, + enum Month month, + int day, + int hour, + int min, + int sec, + int nsec, + int offset_sec) { + // Normalize month, overflowing into year. + int m = month - 1; + norm(year, m, 12, &year, &m); + month = m + 1; + + // Normalize nsec, sec, min, hour, overflowing into day. + norm(sec, nsec, 1000000000, &sec, &nsec); + norm(min, sec, 60, &min, &sec); + norm(hour, min, 60, &hour, &min); + norm(day, hour, 24, &day, &hour); + + // Compute days since the absolute epoch. + uint64_t d = days_since_epoch(year); + + // Add in days before this month. + d += days_before[month - 1]; + if (is_leap(year) && month >= March) { + d++; // February 29 + } + + // Add in days before today. + d += day - 1; + + // Add in time elapsed today. + uint64_t abs = d * seconds_per_day; + abs += hour * seconds_per_hour + min * seconds_per_minute + sec; + + // Convert to UTC. + abs -= offset_sec; + + return (Time){abs + absolute_to_internal, nsec}; +} + +#pragma endregion + +#pragma region Time parts + +// time_get_date returns the year, month, and day in which t occurs. +void time_get_date(Time t, int* year, enum Month* month, int* day) { + uint64_t abs = abs_time(t); + int yday; + abs_date_full(abs, year, month, day, &yday); +} + +// time_get_year returns the year in which t occurs. +int time_get_year(Time t) { + uint64_t abs = abs_time(t); + int year, yday; + abs_date(abs, &year, &yday); + return year; +} + +// time_get_month returns the month of the year specified by t. +enum Month time_get_month(Time t) { + uint64_t abs = abs_time(t); + int year, day, yday; + enum Month month; + abs_date_full(abs, &year, &month, &day, &yday); + return month; +} + +// time_get_day returns the day of the month specified by t. +int time_get_day(Time t) { + uint64_t abs = abs_time(t); + int year, day, yday; + enum Month month; + abs_date_full(abs, &year, &month, &day, &yday); + return day; +} + +// time_get_clock returns the hour, minute, and second within the day specified by t. 
+void time_get_clock(Time t, int* hour, int* min, int* sec) { + uint64_t abs = abs_time(t); + abs_clock(abs, hour, min, sec); +} + +// time_get_hour returns the hour within the day specified by t, in the range [0, 23]. +int time_get_hour(Time t) { + uint64_t abs = abs_time(t); + return (abs % seconds_per_day) / seconds_per_hour; +} + +// time_get_minute returns the minute offset within the hour specified by t, in the range [0, 59]. +int time_get_minute(Time t) { + uint64_t abs = abs_time(t); + return (abs % seconds_per_hour) / seconds_per_minute; +} + +// time_get_second returns the second offset within the minute specified by t, in the range [0, 59]. +int time_get_second(Time t) { + uint64_t abs = abs_time(t); + return abs % seconds_per_minute; +} + +// time_get_nano returns the nanosecond offset within the second specified by t, +// in the range [0, 999999999]. +int time_get_nano(Time t) { + return t.nsec; +} + +// time_get_weekday returns the day of the week specified by t. +enum Weekday time_get_weekday(Time t) { + uint64_t abs = abs_time(t); + return abs_weekday(abs); +} + +// time_get_yearday returns the day of the year specified by t, in the range [1,365] for non-leap +// years, and [1,366] in leap years. +int time_get_yearday(Time t) { + uint64_t abs = abs_time(t); + int year, yday; + abs_date(abs, &year, &yday); + return yday + 1; +} + +// time_get_isoweek returns the ISO 8601 year and week number in which t occurs. +// Week ranges from 1 to 53. Jan 01 to Jan 03 of year n might belong to +// week 52 or 53 of year n-1, and Dec 29 to Dec 31 might belong to week 1 of year n+1. +void time_get_isoweek(Time t, int* year, int* week) { + // According to the rule that the first calendar week of a calendar year is + // the week including the first Thursday of that year, and that the last one is + // the week immediately preceding the first calendar week of the next calendar year. 
+ // See https://www.iso.org/obp/ui#iso:std:iso:8601:-1:ed-1:v1:en:term:3.1.1.23 for details. + + // weeks start with Monday + // Monday Tuesday Wednesday Thursday Friday Saturday Sunday + // 1 2 3 4 5 6 7 + // +3 +2 +1 0 -1 -2 -3 + // the offset to Thursday + uint64_t abs = abs_time(t); + int d = (Thursday - abs_weekday(abs)); + // handle Sunday + if (d == 4) { + d = -3; + } + // find the Thursday of the calendar week + int yday; + abs += d * seconds_per_day; + abs_date(abs, year, &yday); + *week = yday / 7 + 1; +} + +#pragma endregion + +#pragma region Unix time + +// time_unix returns the Time corresponding to the given Unix time, +// sec seconds and nsec nanoseconds since January 1, 1970 UTC. +// It is valid to pass nsec outside the range [0, 999999999]. +// Not all sec values have a corresponding time value. One such +// value is 1<<63-1 (the largest int64 value). +Time time_unix(int64_t sec, int64_t nsec) { + if (nsec < 0 || nsec >= 1000000000) { + int64_t n = nsec / 1000000000; + sec += n; + nsec -= n * 1000000000; + if (nsec < 0) { + nsec += 1000000000; + sec--; + } + } + return unix_time(sec, nsec); +} + +// time_milli returns the Time corresponding to the given Unix time, +// msec milliseconds since January 1, 1970 UTC. +Time time_milli(int64_t msec) { + return time_unix(msec / 1000, (msec % 1000) * 1000000); +} + +// time_micro returns the Time corresponding to the given Unix time, +// usec microseconds since January 1, 1970 UTC. +Time time_micro(int64_t usec) { + return time_unix(usec / 1000000, (usec % 1000000) * 1000); +} + +// time_nano returns the Time corresponding to the given Unix time, +// nsec nanoseconds since January 1, 1970 UTC. +Time time_nano(int64_t nsec) { + return time_unix(0, nsec); +} + +// time_to_unix returns t as a Unix time, the number of seconds elapsed +// since January 1, 1970 UTC. 
+// Unix-like operating systems often record time as a 32-bit +// count of seconds, but since the method here returns a 64-bit +// value it is valid for billions of years into the past or future. +int64_t time_to_unix(Time t) { + return unix_sec(t); +} + +// time_to_milli returns t as a Unix time, the number of milliseconds elapsed since +// January 1, 1970 UTC. The result is undefined if the Unix time in +// milliseconds cannot be represented by an int64 (a date more than 292 million +// years before or after 1970). +int64_t time_to_milli(Time t) { + return unix_sec(t) * 1000 + t.nsec / 1000000; +} + +// time_to_micro returns t as a Unix time, the number of microseconds elapsed since +// January 1, 1970 UTC. The result is undefined if the Unix time in +// microseconds cannot be represented by an int64 (a date before year -290307 or +// after year 294246). +int64_t time_to_micro(Time t) { + return unix_sec(t) * 1000000 + t.nsec / 1000; +} + +// time_to_nano returns t as a Unix time, the number of nanoseconds elapsed +// since January 1, 1970 UTC. The result is undefined if the Unix time +// in nanoseconds cannot be represented by an int64 (a date before the year +// 1678 or after 2262). Note that this means the result of calling UnixNano +// on the zero Time is undefined. +int64_t time_to_nano(Time t) { + return unix_sec(t) * 1000000000 + t.nsec; +} + +#pragma endregion + +#pragma region Calendar time + +// time_tm returns the Time corresponding to the given calendar time at the given timezone offset. +Time time_tm(struct tm tm, int offset_sec) { + int year = tm.tm_year + 1900; + int month = tm.tm_mon + 1; + int day = tm.tm_mday; + int hour = tm.tm_hour; + int min = tm.tm_min; + int sec = tm.tm_sec; + return time_date(year, month, day, hour, min, sec, 0, offset_sec); +} + +// time_to_tm returns t in the given timezone offset as a calendar time. 
+struct tm time_to_tm(Time t, int offset_sec) { + Time loc_t = time_add(t, offset_sec * Second); + int year, day, hour, min, sec; + enum Month month; + time_get_date(loc_t, &year, &month, &day); + time_get_clock(loc_t, &hour, &min, &sec); + struct tm tm = { + .tm_year = year - 1900, + .tm_mon = month - 1, + .tm_mday = day, + .tm_hour = hour, + .tm_min = min, + .tm_sec = sec, + .tm_isdst = -1, + }; + return tm; +} + +#pragma endregion + +#pragma region Comparison + +// time_after reports whether the time instant t is after u. +bool time_after(Time t, Time u) { + return t.sec > u.sec || (t.sec == u.sec && t.nsec > u.nsec); +} + +// time_before reports whether the time instant t is before u. +bool time_before(Time t, Time u) { + return t.sec < u.sec || (t.sec == u.sec && t.nsec < u.nsec); +} + +// time_compare compares the time instant t with u. If t is before u, it returns -1; +// if t is after u, it returns +1; if they're the same, it returns 0. +int time_compare(Time t, Time u) { + if (time_before(t, u)) { + return -1; + } + if (time_after(t, u)) { + return +1; + } + return 0; +} + +// time_equal reports whether t and u represent the same time instant. +bool time_equal(Time t, Time u) { + return t.sec == u.sec && t.nsec == u.nsec; +} + +// time_is_zero reports whether t represents the zero time instant, +// January 1, year 1, 00:00:00 UTC. +bool time_is_zero(Time t) { + return t.sec == 0 && t.nsec == 0; +} + +#pragma endregion + +#pragma region Arithmetic + +// time_add returns the time t+d. +Time time_add(Time t, Duration d) { + int64_t dsec = d / Second; + int64_t nsec = t.nsec + d % 1000000000; + if (nsec >= 1e9) { + dsec++; + nsec -= 1e9; + } else if (nsec < 0) { + dsec--; + nsec += 1e9; + } + return (Time){t.sec + dsec, nsec}; +} + +// time_sub returns the duration t-u. If the result exceeds the maximum (or minimum) +// value that can be stored in a Duration, the maximum (or minimum) duration +// will be returned. 
+Duration time_sub(Time t, Time u) { + int64_t d = (t.sec - u.sec) * Second + (t.nsec - u.nsec); + if (time_equal(time_add(u, d), t)) { + return d; // d is correct + } + if (time_before(t, u)) { + return MIN_DURATION; // t - u is negative out of range + } + return MAX_DURATION; // t - u is positive out of range +} + +// time_since returns the time elapsed since t. +// It is shorthand for time_sub(time_now(), t). +Duration time_since(Time t) { + return time_sub(time_now(), t); +} + +// time_until returns the duration until t. +// It is shorthand for time_sub(t, time_now()). +Duration time_until(Time t) { + return time_sub(t, time_now()); +} + +// time_add_date returns the time corresponding to adding the +// given number of years, months, and days to t. +// For example, time_add_date(-1, 2, 3) applied to January 1, 2011 +// returns March 4, 2010. +// +// time_add_date normalizes its result in the same way that Date does, +// so, for example, adding one month to October 31 yields +// December 1, the normalized form for November 31. +Time time_add_date(Time t, int years, int months, int days) { + int year, day; + enum Month month; + time_get_date(t, &year, &month, &day); + int hour, min, sec; + time_get_clock(t, &hour, &min, &sec); + return time_date(year + years, month + months, day + days, hour, min, sec, t.nsec, TIMEX_UTC); +} + +#pragma endregion + +#pragma region Rounding + +// time_truncate returns the result of rounding t down to a multiple of d (since the zero time). +// Only supports d which is a multiple of 1 second. If d <= 0, returns t unchanged. +Time time_truncate(Time t, Duration d) { + if (d <= 0) { + return t; + } + Duration r = time_div(t, d); + return time_add(t, -r); +} + +// time_round returns the result of rounding t to the nearest multiple of d (since the zero time). +// The rounding behavior for halfway values is to round up. +// If d <= 0, returns t unchanged. 
+Time time_round(Time t, Duration d) { + if (d <= 0) { + return t; + } + Duration r = time_div(t, d); + if (tless_than_half(r, d)) { + return time_add(t, -r); + } + return time_add(t, d - r); +} + +#pragma endregion + +#pragma region Formatting + +// time_fmt_iso returns an ISO 8601 time string for the given time value. +// Converts the time value to the given timezone offset before formatting. +// Chooses the most compact representation: +// - 2006-01-02T15:04:05.999999999+07:00 +// - 2006-01-02T15:04:05.999999999Z +// - 2006-01-02T15:04:05+07:00 +// - 2006-01-02T15:04:05Z +size_t time_fmt_iso(char* buf, size_t size, Time t, int offset_sec) { + int year, day, hour, min, sec; + enum Month month; + const char* layout; + size_t n = 0; + + if (offset_sec == 0) { + time_get_date(t, &year, &month, &day); + time_get_clock(t, &hour, &min, &sec); + if (t.nsec == 0) { + layout = "%04d-%02d-%02dT%02d:%02d:%02dZ"; + n = snprintf(buf, size, layout, year, month, day, hour, min, sec); + } else { + layout = "%04d-%02d-%02dT%02d:%02d:%02d.%09dZ"; + n = snprintf(buf, size, layout, year, month, day, hour, min, sec, t.nsec); + } + } else { + Time loc_t = time_add(t, offset_sec * Second); + time_get_date(loc_t, &year, &month, &day); + time_get_clock(loc_t, &hour, &min, &sec); + int ofhour = offset_sec / 3600; + int ofmin = (offset_sec % 3600) / 60; + if (ofmin < 0) { + ofmin = -ofmin; + } + if (loc_t.nsec == 0) { + layout = "%04d-%02d-%02dT%02d:%02d:%02d%+03d:%02d"; + n = snprintf(buf, size, layout, year, month, day, hour, min, sec, ofhour, ofmin); + } else { + layout = "%04d-%02d-%02dT%02d:%02d:%02d.%09d%+03d:%02d"; + n = snprintf(buf, size, layout, year, month, day, hour, min, sec, loc_t.nsec, ofhour, + ofmin); + } + } + return n; +} + +// time_fmt_datetime returns a datetime string +// (2006-01-02 15:04:05) for the given time value. +// Converts the time value to the given timezone offset before formatting. 
+size_t time_fmt_datetime(char* buf, size_t size, Time t, int offset_sec) { + int year, day, hour, min, sec; + enum Month month; + if (offset_sec == 0) { + time_get_date(t, &year, &month, &day); + time_get_clock(t, &hour, &min, &sec); + } else { + Time loc_t = time_add(t, offset_sec * Second); + time_get_date(loc_t, &year, &month, &day); + time_get_clock(loc_t, &hour, &min, &sec); + } + return snprintf(buf, size, "%04d-%02d-%02d %02d:%02d:%02d", year, month, day, hour, min, sec); +} + +// time_fmt_date returns a date string +// (2006-01-02) for the given time value. +// Converts the time value to the given timezone offset before formatting. +size_t time_fmt_date(char* buf, size_t size, Time t, int offset_sec) { + int year, day; + enum Month month; + if (offset_sec == 0) { + time_get_date(t, &year, &month, &day); + } else { + Time loc_t = time_add(t, offset_sec * Second); + time_get_date(loc_t, &year, &month, &day); + } + return snprintf(buf, size, "%04d-%02d-%02d", year, month, day); +} + +// time_fmt_time returns a time string +// (15:04:05) for the given time value. +// Converts the time value to the given timezone offset before formatting. +size_t time_fmt_time(char* buf, size_t size, Time t, int offset_sec) { + int hour, min, sec; + if (offset_sec == 0) { + time_get_clock(t, &hour, &min, &sec); + } else { + Time loc_t = time_add(t, offset_sec * Second); + time_get_clock(loc_t, &hour, &min, &sec); + } + return snprintf(buf, size, "%02d:%02d:%02d", hour, min, sec); +} + +// time_parse parses a formatted string and returns the time value it represents. 
+// Supports a limited set of layouts: +// - "2006-01-02T15:04:05.999999999+07:00" (ISO 8601 with nanoseconds and timezone) +// - "2006-01-02T15:04:05.999999999Z" (ISO 8601 with nanoseconds, UTC) +// - "2006-01-02T15:04:05+07:00" (ISO 8601 with timezone) +// - "2006-01-02T15:04:05Z" (ISO 8601, UTC) +// - "2006-01-02 15:04:05" (date and time, UTC) +// - "2006-01-02" (date only, UTC) +// - "15:04:05" (time only, UTC) +Time time_parse(const char* value) { + Time zero = {0, 0}; + size_t len = strlen(value); + if (len < 8 || len > 35) { + return zero; + } + + int year = 1, month = 1, day = 1, hour = 0, min = 0, sec = 0, nsec = 0, offset_sec = TIMEX_UTC; + char tz[7] = ""; + + if (len == 35) { + // "2006-01-02T15:04:05.999999999+07:00" + int n = sscanf(value, "%d-%d-%dT%d:%d:%d.%d%6s", &year, &month, &day, &hour, &min, &sec, + &nsec, tz); + if (n != 8) { + return zero; + } + } + + if (len == 30) { + // "2006-01-02T15:04:05.999999999Z" + int n = + sscanf(value, "%d-%d-%dT%d:%d:%d.%dZ", &year, &month, &day, &hour, &min, &sec, &nsec); + if (n != 7) { + return zero; + } + } + + if (len == 25) { + // "2006-01-02T15:04:05+07:00" + int n = sscanf(value, "%d-%d-%dT%d:%d:%d%6s", &year, &month, &day, &hour, &min, &sec, tz); + if (n != 7) { + return zero; + } + } + + if (len == 19 || len == 20) { + // "2006-01-02T15:04:05Z" + // "2006-01-02 15:04:05" + int n = sscanf(value, "%d-%d-%d%*c%d:%d:%d", &year, &month, &day, &hour, &min, &sec); + if (n != 6) { + return zero; + } + } + + if (len == 10) { + // "2006-01-02" + int n = sscanf(value, "%d-%d-%d", &year, &month, &day); + if (n != 3) { + return zero; + } + } + + if (len == 8) { + // "15:04:05" + int n = sscanf(value, "%d:%d:%d", &hour, &min, &sec); + if (n != 3) { + return zero; + } + } + + if (tz[0] != '\0') { + // Parse timezone offset. + // + 0 7 : 0 0 + // ⁰ ¹ ² ³ ⁴ ⁵ + // tz[0] is the sign. + int sign = (tz[0] == '-') ? -1 : 1; + // tz[1] and tz[2] are hours. 
+ offset_sec = ((tz[1] - '0') * 10 + (tz[2] - '0')) * 3600 * sign; + // tz[4] and tz[5] are minutes. + offset_sec += ((tz[4] - '0') * 10 + (tz[5] - '0')) * 60 * sign; + } + + return time_date(year, (enum Month)month, day, hour, min, sec, nsec, offset_sec); +} + +#pragma endregion + +#pragma region Marshaling + +// time_blob returns the time instant represented by the binary data. +// The blob must have been created by time_to_blob and be at least 13 bytes long. +Time time_blob(const uint8_t* buf) { + const uint8_t version = buf[0]; + if (version != 1) { + return (Time){0, 0}; + } + + int64_t sec = (int64_t)buf[8] | (int64_t)buf[7] << 8 | (int64_t)buf[6] << 16 | + (int64_t)buf[5] << 24 | (int64_t)buf[4] << 32 | (int64_t)buf[3] << 40 | + (int64_t)buf[2] << 48 | (int64_t)buf[1] << 56; + + int32_t nsec = + (int32_t)buf[12] | (int32_t)buf[11] << 8 | (int32_t)buf[10] << 16 | (int32_t)buf[9] << 24; + + return (Time){sec, nsec}; +} + +// time_to_blob returns the binary representation of the time instant t. 
+// The result is a byte slice with the following layout: +// 0: version (currently 1) +// 1-8: seconds +// 9-12: nanoseconds +void time_to_blob(Time t, uint8_t* buf) { + const uint8_t version = 1; + buf[0] = version; + buf[1] = t.sec >> 56; // bytes 1-8: seconds + buf[2] = t.sec >> 48; + buf[3] = t.sec >> 40; + buf[4] = t.sec >> 32; + buf[5] = t.sec >> 24; + buf[6] = t.sec >> 16; + buf[7] = t.sec >> 8; + buf[8] = t.sec; + buf[9] = t.nsec >> 24; // bytes 9-12: nanoseconds + buf[10] = t.nsec >> 16; + buf[11] = t.nsec >> 8; + buf[12] = t.nsec; +} + +#pragma endregion diff --git a/libsql-ffi/bundled/sqlean/time/timex.h b/libsql-ffi/bundled/sqlean/time/timex.h new file mode 100644 index 0000000000..4047ff48b3 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/time/timex.h @@ -0,0 +1,270 @@ +// Copyright (c) 2024 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Based on Go's time package, BSD 3-Clause License +// https://github.com/golang/go + +// Package timex provides functionality for working with time. +// The calendrical calculations always assume a Gregorian calendar, with no leap seconds. + +#ifndef TIMEX_H +#define TIMEX_H + +#include +#include +#include + +// Month is a month of the year. +enum Month { + January = 1, + February, + March, + April, + May, + June, + July, + August, + September, + October, + November, + December, +}; + +// Weekday is a day of the week (Sunday = 0, ...). +enum Weekday { + Sunday = 0, + Monday, + Tuesday, + Wednesday, + Thursday, + Friday, + Saturday, +}; + +// Time represents an instant in time with nanosecond precision. +// The zero value is January 1, year 1, 00:00:00.000000000 UTC. +typedef struct { + int64_t sec; // seconds since zero time + int32_t nsec; // nanoseconds within the second [0, 999999999] +} Time; + +#define TIMEX_BLOB_SIZE 13 +#define TIMEX_UTC 0 + +// Duration represents the elapsed time between two instants +// as an int64 nanosecond count. 
The representation limits the +// largest representable duration to approximately 290 years. +typedef int64_t Duration; + +// --- Time --- + +// Constructors. + +// time_now returns the current time in UTC. +Time time_now(void); + +// time_date returns the Time corresponding to +// yyyy-mm-dd hh:mm:ss + nsec nanoseconds +// with the given timezone offset in seconds. +Time time_date(int year, + enum Month month, + int day, + int hour, + int min, + int sec, + int nsec, + int offset_sec); + +// Time parts. + +// time_get_date returns the year, month, and day in which t occurs. +void time_get_date(Time t, int* year, enum Month* month, int* day); + +// time_get_year returns the year in which t occurs. +int time_get_year(Time t); + +// time_get_month returns the month of the year specified by t. +enum Month time_get_month(Time t); + +// time_get_day returns the day of the month specified by t. +int time_get_day(Time t); + +// time_get_clock returns the hour, minute, and second within the day specified by t. +void time_get_clock(Time t, int* hour, int* min, int* sec); + +// time_get_hour returns the hour within the day specified by t. +int time_get_hour(Time t); + +// time_get_minute returns the minute offset within the hour specified by t. +int time_get_minute(Time t); + +// time_get_second returns the second offset within the minute specified by t. +int time_get_second(Time t); + +// time_get_nano returns the nanosecond offset within the second specified by t. +int time_get_nano(Time t); + +// time_get_weekday returns the day of the week specified by t. +enum Weekday time_get_weekday(Time t); + +// time_get_yearday returns the day of the year specified by t. +int time_get_yearday(Time t); + +// time_get_isoweek returns the ISO 8601 year and week number in which t occurs. +void time_get_isoweek(Time t, int* year, int* week); + +// Unix time. 
+ +// time_unix returns the Time corresponding to the given Unix time, +// sec seconds and nsec nanoseconds since January 1, 1970 UTC. +Time time_unix(int64_t sec, int64_t nsec); + +// time_milli returns the Time corresponding to the given Unix time, +// msec milliseconds since January 1, 1970 UTC. +Time time_milli(int64_t msec); + +// time_micro returns the Time corresponding to the given Unix time, +// usec microseconds since January 1, 1970 UTC. +Time time_micro(int64_t usec); + +// time_nano returns the Time corresponding to the given Unix time, +// nsec nanoseconds since January 1, 1970 UTC. +Time time_nano(int64_t nsec); + +// time_to_unix returns t as a Unix time, the number of seconds elapsed +// since January 1, 1970 UTC. +int64_t time_to_unix(Time t); + +// time_to_milli returns t as a Unix time, the number of milliseconds elapsed since +// January 1, 1970 UTC. +int64_t time_to_milli(Time t); + +// time_to_micro returns t as a Unix time, the number of microseconds elapsed since +// January 1, 1970 UTC. +int64_t time_to_micro(Time t); + +// time_to_nano returns t as a Unix time, the number of nanoseconds elapsed +// since January 1, 1970 UTC. +int64_t time_to_nano(Time t); + +// Calendar time. + +// time_tm returns the Time corresponding to the given calendar time at the given timezone offset. +Time time_tm(struct tm tm, int offset_sec); + +// time_to_tm returns t in the given timezone offset as a calendar time. +struct tm time_to_tm(Time t, int offset_sec); + +// Comparison. + +// time_after reports whether the time instant t is after u. +bool time_after(Time t, Time u); + +// time_before reports whether the time instant t is before u. +bool time_before(Time t, Time u); + +// time_compare compares the time instant t with u. +int time_compare(Time t, Time u); + +// time_equal reports whether t and u represent the same time instant.
+bool time_equal(Time t, Time u); + +// time_is_zero reports whether t represents the zero time instant, +// January 1, year 1, 00:00:00 UTC. +bool time_is_zero(Time t); + +// Arithmetic. + +// time_add returns the time t+d. +Time time_add(Time t, Duration d); + +// time_sub returns the duration t-u. +Duration time_sub(Time t, Time u); + +// time_since returns the time elapsed since t. +Duration time_since(Time t); + +// time_until returns the duration until t. +Duration time_until(Time t); + +// time_add_date returns the time corresponding to adding the +// given number of years, months, and days to t. +Time time_add_date(Time t, int years, int months, int days); + +// Rounding. + +// time_truncate returns the result of rounding t down to a multiple of d. +Time time_truncate(Time t, Duration d); + +// time_round returns the result of rounding t to the nearest multiple of d. +Time time_round(Time t, Duration d); + +// Formatting. + +// time_fmt_iso returns an ISO 8601 time string for the given time value. +size_t time_fmt_iso(char* buf, size_t size, Time t, int offset_sec); + +// time_fmt_datetime returns a datetime string for the given time value. +size_t time_fmt_datetime(char* buf, size_t size, Time t, int offset_sec); + +// time_fmt_date returns a date string for the given time value. +size_t time_fmt_date(char* buf, size_t size, Time t, int offset_sec); + +// time_fmt_time returns a time string for the given time value. +size_t time_fmt_time(char* buf, size_t size, Time t, int offset_sec); + +// time_parse parses a formatted string and returns the time value it represents. +Time time_parse(const char* value); + +// Marshaling. + +// time_blob returns the time instant represented by the binary data. +Time time_blob(const uint8_t* buf); + +// time_to_blob returns the binary representation of the time instant t. +void time_to_blob(Time t, uint8_t* buf); + +// --- Duration --- + +// Min/Max durations. 
+#define MIN_DURATION INT64_MIN +#define MAX_DURATION INT64_MAX + +// Common durations. There is no definition for units of Day or larger +// to avoid confusion across daylight savings time zone transitions. +extern const Duration Nanosecond; +extern const Duration Microsecond; +extern const Duration Millisecond; +extern const Duration Second; +extern const Duration Minute; +extern const Duration Hour; + +// Conversion. + +// dur_to_micro returns the duration as an integer microsecond count. +int64_t dur_to_micro(Duration d); + +// dur_to_milli returns the duration as an integer millisecond count. +int64_t dur_to_milli(Duration d); + +// dur_to_seconds returns the duration as a floating point number of seconds. +double dur_to_seconds(Duration d); + +// dur_to_minutes returns the duration as a floating point number of minutes. +double dur_to_minutes(Duration d); + +// dur_to_hours returns the duration as a floating point number of hours. +double dur_to_hours(Duration d); + +// Rounding. + +// dur_truncate returns the result of rounding d toward zero to a multiple of m. +Duration dur_truncate(Duration d, Duration m); + +// dur_round returns the result of rounding d to the nearest multiple of m. +Duration dur_round(Duration d, Duration m); + +// dur_abs returns the absolute value of d. +Duration dur_abs(Duration d); + +#endif /* TIMEX_H */ diff --git a/libsql-ffi/bundled/sqlean/unicode/extension.c b/libsql-ffi/bundled/sqlean/unicode/extension.c new file mode 100644 index 0000000000..6eba3c9f89 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/unicode/extension.c @@ -0,0 +1,5393 @@ +// Originally by Unknown Author, Public Domain +// https://github.com/Zensey/sqlite3_unicode + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Unicode support for SQLite. + +/* + * Implements case-insensitive string comparison for Unicode strings. 
+ * Provides the following Unicode features: + * + * - upper(), lower() and casefold() functions to normalize case. + * - like() function and LIKE operator with case-independent matching. + * - unaccent() function to normalize strings by removing accents. + * + * Tries to override the default NOCASE case-insensitive collation sequence + * to support UTF-8 characters (available in SQLite CLI and C API only). + * + * Compile the project with the SQLITE_ENABLE_UNICODE preprocessor definition + * in order to enable the code below. + */ + +/* +** Un|Comment to provide additional unicode support to SQLite3 or adjust size for unused features +*/ +#define SQLITE3_UNICODE_FOLD // ~ 10KB increase +#define SQLITE3_UNICODE_LOWER // ~ 10KB increase +#define SQLITE3_UNICODE_UPPER // ~ 10KB increase +// #define SQLITE3_UNICODE_TITLE // ~ 10KB increase +#define SQLITE3_UNICODE_UNACC // ~ 30KB increase + +/* +** SQLITE3_UNICODE_COLLATE will register and use the custom nocase collation instead of the standard +** one, which supports case folding and unaccenting. +*/ +#define SQLITE3_UNICODE_COLLATE // requires SQLITE3_UNICODE_FOLD to be defined as well. + +/* +** SQLITE3_UNICODE_UNACC_AUTOMATIC will automatically try to unaccent any characters that +** are over the 0x80 character in the LIKE comparison operation and in the NOCASE collation +** sequence. +*/ +#define SQLITE3_UNICODE_UNACC_AUTOMATIC // requires SQLITE3_UNICODE_UNACC to be defined as well. + +/************************************************************************************************* +** DO NOT MODIFY BELOW THIS LINE +**************************************************************************************************/ + +/* Generated by builder. Do not modify. 
Start unicode_version_defines */ +/* +File was generated by : sqlite3_unicode.in +File was generated on : Fri Jun 5 01:10:23 2009 +Using unicode data db : UnicodeData.txt +Using unicode fold db : CaseFolding.txt +*/ +#define SQLITE3_UNICODE_VERSION_MAJOR 5 +#define SQLITE3_UNICODE_VERSION_MINOR 1 +#define SQLITE3_UNICODE_VERSION_MICRO 0 +#define SQLITE3_UNICODE_VERSION_BUILD 12 + +#define __SQLITE3_UNICODE_VERSION_STRING(a, b, c, d) #a "." #b "." #c "." #d +#define _SQLITE3_UNICODE_VERSION_STRING(a, b, c, d) __SQLITE3_UNICODE_VERSION_STRING(a, b, c, d) +#define SQLITE3_UNICODE_VERSION_STRING \ + _SQLITE3_UNICODE_VERSION_STRING(SQLITE3_UNICODE_VERSION_MAJOR, SQLITE3_UNICODE_VERSION_MINOR, \ + SQLITE3_UNICODE_VERSION_MICRO, SQLITE3_UNICODE_VERSION_BUILD) + +/* Generated by builder. Do not modify. End unicode_version_defines */ + +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#ifndef _SQLITE3_UNICODE_H +#define _SQLITE3_UNICODE_H + +/* +** Add the ability to override 'extern' +*/ +/* +** +** The define of SQLITE_EXPORT is necessary to add the ability of exporting +** functions for both Microsoft Windows and Linux systems without the need +** of a .def file containing the names of the functions being exported. +*/ +#ifndef SQLITE_EXPORT +#if ((defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || \ + defined(__BORLANDC__)) && \ + (!defined(SQLITE_CORE))) +#define SQLITE_EXPORT __declspec(dllexport) +#else +#define SQLITE_EXPORT SQLITE_EXTERN +#endif +#endif + +#ifndef SQLITE_PRIVATE +#define SQLITE_PRIVATE static +#endif +#ifndef SQLITE_API +#define SQLITE_API +#endif + +/* +** Integers of known sizes. These typedefs might change for architectures +** where the sizes vary. Preprocessor macros are available so that the +** types can be conveniently redefined at compile-time. Like this: +** +** cc '-DUINTPTR_TYPE=long long int' ... 
+*/ +#ifndef UINT32_TYPE +#ifdef HAVE_UINT32_T +#define UINT32_TYPE uint32_t +#else +#define UINT32_TYPE unsigned int +#endif +#endif +#ifndef UINT16_TYPE +#ifdef HAVE_UINT16_T +#define UINT16_TYPE uint16_t +#else +#define UINT16_TYPE unsigned short int +#endif +#endif +#ifndef INT16_TYPE +#ifdef HAVE_INT16_T +#define INT16_TYPE int16_t +#else +#define INT16_TYPE short int +#endif +#endif +#ifndef UINT8_TYPE +#ifdef HAVE_UINT8_T +#define UINT8_TYPE uint8_t +#else +#define UINT8_TYPE unsigned char +#endif +#endif +#ifndef INT8_TYPE +#ifdef HAVE_INT8_T +#define INT8_TYPE int8_t +#else +#define INT8_TYPE signed char +#endif +#endif +#ifndef LONGDOUBLE_TYPE +#define LONGDOUBLE_TYPE long double +#endif +typedef sqlite_int64 i64; /* 8-byte signed integer */ +typedef sqlite_uint64 u64; /* 8-byte unsigned integer */ +typedef UINT32_TYPE u32; /* 4-byte unsigned integer */ +typedef UINT16_TYPE u16; /* 2-byte unsigned integer */ +typedef INT16_TYPE i16; /* 2-byte signed integer */ +typedef UINT8_TYPE u8; /* 1-byte unsigned integer */ +typedef INT8_TYPE i8; /* 1-byte signed integer */ + +/* +** +** These functions are intended for case conversion of single characters +** and return a single character containing the case converted character +** based on the unicode mapping tables. +*/ +SQLITE_EXPORT u16 sqlite3_unicode_fold(u16 c); +SQLITE_EXPORT u16 sqlite3_unicode_lower(u16 c); +SQLITE_EXPORT u16 sqlite3_unicode_upper(u16 c); +SQLITE_EXPORT u16 sqlite3_unicode_title(u16 c); + +/* +** +** This function is intended for decomposing of single characters +** and return a pointer of characters (u16 **)p containing the decomposed +** character or string of characters. (int *)l will contain the length +** of characters contained in (u16 **)p based on the unicode mapping tables. +*/ +SQLITE_EXPORT u16 sqlite3_unicode_unacc(u16 c, u16** p, int* l); + +/* +** Another built-in collating sequence: NOCASE. 
+** +** This collating sequence is intended to be used for "case independent +** comparison". SQLite's knowledge of upper and lower case equivalents +** extends only to the 26 characters used in the English language. +** +** At the moment there is only a UTF-8 implementation. +*/ +/* +** +** The built-in collating sequence: NOCASE is extended to accommodate the +** unicode case folding mapping tables to normalize characters to their +** fold equivalents and test them for equality. +** +** Both UTF-8 and UTF-16 implementations are supported. +** +** (void *)encoding takes the following values +** * SQLITE_UTF8 for UTF-8 encoded string comparison +** * SQLITE_UTF16 for UTF-16 encoded string comparison +*/ +SQLITE_EXPORT int sqlite3_unicode_collate(void* encoding, + int nKey1, + const void* pKey1, + int nKey2, + const void* pKey2); + +/* +** +** The following function needs to be called at application startup to load the extension. +*/ +SQLITE_EXPORT int sqlite3_unicode_load(); + +/* +** +** The following function needs to be called before application exit to unload the extension. +*/ +SQLITE_EXPORT void sqlite3_unicode_free(); + +#endif /* _SQLITE3_UNICODE_H */ +/************************************************************************************************* + ************************************************************************************************* + *************************************************************************************************/ + +#ifdef SQLITE3_UNICODE_FOLD +/* Generated by builder. Do not modify. Start unicode_fold_defines */ +#define UNICODE_FOLD_BLOCK_SHIFT 5 +#define UNICODE_FOLD_BLOCK_MASK ((1 << UNICODE_FOLD_BLOCK_SHIFT) - 1) +#define UNICODE_FOLD_BLOCK_SIZE (1 << UNICODE_FOLD_BLOCK_SHIFT) +#define UNICODE_FOLD_BLOCK_COUNT 69 +#define UNICODE_FOLD_INDEXES_SIZE (0x10000 >> UNICODE_FOLD_BLOCK_SHIFT) +/* Generated by builder. Do not modify. End unicode_fold_defines */ + +/* Generated by builder. Do not modify. 
Start unicode_fold_tables */ + +static unsigned short unicode_fold_indexes[UNICODE_FOLD_INDEXES_SIZE] = { + 0, 0, 1, 0, 0, 2, 3, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 0, 0, 0, 0, + 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 0, 23, 24, 25, 26, 27, 28, 29, 30, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 49, 0, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 52, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 55, 0, 56, 57, 58, 59, 60, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 62, 63, 0, 0, 0, 0, 64, 65, 66, 67, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 68, 0, 0, 0, 0, 0, 0}; + +static unsigned char unicode_fold_positions[UNICODE_FOLD_BLOCK_COUNT][UNICODE_FOLD_BLOCK_SIZE + 1] = + { + /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 1 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 2 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 3 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 4 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 5 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 6 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 7 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 8 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 9 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 10 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 11 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 12 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 13 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 14 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 15 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 16 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 17 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 18 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 19 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 20 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 21 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 22 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 23 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 24 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32}, + /* 25 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 26 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 27 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 28 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 29 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 30 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 32 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 33 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 34 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 35 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 36 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 37 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 38 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 39 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 40 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 41 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 42 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 43 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 44 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 45 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 46 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 47 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 48 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 49 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 50 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 51 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 52 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 53 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 54 */ 
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 55 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 56 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 57 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 58 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 59 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 60 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 61 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 62 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 63 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 64 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 65 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 66 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 67 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 68 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}}; + +static unsigned short unicode_fold_data0[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data1[] = { + 0xFFFF, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, + 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, + 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data2[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03BC, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data3[] = { + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, + 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, + 0x00F6, 0xFFFF, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0xFFFF}; +static unsigned short unicode_fold_data4[] = { + 0x0101, 0xFFFF, 0x0103, 0xFFFF, 0x0105, 0xFFFF, 0x0107, 0xFFFF, 0x0109, 0xFFFF, 0x010B, + 0xFFFF, 0x010D, 0xFFFF, 0x010F, 0xFFFF, 0x0111, 0xFFFF, 0x0113, 0xFFFF, 0x0115, 0xFFFF, + 0x0117, 0xFFFF, 0x0119, 0xFFFF, 0x011B, 0xFFFF, 0x011D, 0xFFFF, 0x011F, 0xFFFF}; +static unsigned short unicode_fold_data5[] = { + 0x0121, 0xFFFF, 0x0123, 0xFFFF, 0x0125, 0xFFFF, 0x0127, 0xFFFF, 0x0129, 0xFFFF, 0x012B, + 0xFFFF, 0x012D, 0xFFFF, 0x012F, 0xFFFF, 0xFFFF, 0xFFFF, 0x0133, 0xFFFF, 0x0135, 0xFFFF, + 0x0137, 0xFFFF, 0xFFFF, 0x013A, 0xFFFF, 0x013C, 0xFFFF, 0x013E, 0xFFFF, 0x0140}; +static unsigned short unicode_fold_data6[] = { + 0xFFFF, 0x0142, 0xFFFF, 0x0144, 
0xFFFF, 0x0146, 0xFFFF, 0x0148, 0xFFFF, 0xFFFF, 0x014B, + 0xFFFF, 0x014D, 0xFFFF, 0x014F, 0xFFFF, 0x0151, 0xFFFF, 0x0153, 0xFFFF, 0x0155, 0xFFFF, + 0x0157, 0xFFFF, 0x0159, 0xFFFF, 0x015B, 0xFFFF, 0x015D, 0xFFFF, 0x015F, 0xFFFF}; +static unsigned short unicode_fold_data7[] = { + 0x0161, 0xFFFF, 0x0163, 0xFFFF, 0x0165, 0xFFFF, 0x0167, 0xFFFF, 0x0169, 0xFFFF, 0x016B, + 0xFFFF, 0x016D, 0xFFFF, 0x016F, 0xFFFF, 0x0171, 0xFFFF, 0x0173, 0xFFFF, 0x0175, 0xFFFF, + 0x0177, 0xFFFF, 0x00FF, 0x017A, 0xFFFF, 0x017C, 0xFFFF, 0x017E, 0xFFFF, 0x0073}; +static unsigned short unicode_fold_data8[] = { + 0xFFFF, 0x0253, 0x0183, 0xFFFF, 0x0185, 0xFFFF, 0x0254, 0x0188, 0xFFFF, 0x0256, 0x0257, + 0x018C, 0xFFFF, 0xFFFF, 0x01DD, 0x0259, 0x025B, 0x0192, 0xFFFF, 0x0260, 0x0263, 0xFFFF, + 0x0269, 0x0268, 0x0199, 0xFFFF, 0xFFFF, 0xFFFF, 0x026F, 0x0272, 0xFFFF, 0x0275}; +static unsigned short unicode_fold_data9[] = { + 0x01A1, 0xFFFF, 0x01A3, 0xFFFF, 0x01A5, 0xFFFF, 0x0280, 0x01A8, 0xFFFF, 0x0283, 0xFFFF, + 0xFFFF, 0x01AD, 0xFFFF, 0x0288, 0x01B0, 0xFFFF, 0x028A, 0x028B, 0x01B4, 0xFFFF, 0x01B6, + 0xFFFF, 0x0292, 0x01B9, 0xFFFF, 0xFFFF, 0xFFFF, 0x01BD, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data10[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01C6, 0x01C6, 0xFFFF, 0x01C9, 0x01C9, 0xFFFF, 0x01CC, + 0x01CC, 0xFFFF, 0x01CE, 0xFFFF, 0x01D0, 0xFFFF, 0x01D2, 0xFFFF, 0x01D4, 0xFFFF, 0x01D6, + 0xFFFF, 0x01D8, 0xFFFF, 0x01DA, 0xFFFF, 0x01DC, 0xFFFF, 0xFFFF, 0x01DF, 0xFFFF}; +static unsigned short unicode_fold_data11[] = { + 0x01E1, 0xFFFF, 0x01E3, 0xFFFF, 0x01E5, 0xFFFF, 0x01E7, 0xFFFF, 0x01E9, 0xFFFF, 0x01EB, + 0xFFFF, 0x01ED, 0xFFFF, 0x01EF, 0xFFFF, 0xFFFF, 0x01F3, 0x01F3, 0xFFFF, 0x01F5, 0xFFFF, + 0x0195, 0x01BF, 0x01F9, 0xFFFF, 0x01FB, 0xFFFF, 0x01FD, 0xFFFF, 0x01FF, 0xFFFF}; +static unsigned short unicode_fold_data12[] = { + 0x0201, 0xFFFF, 0x0203, 0xFFFF, 0x0205, 0xFFFF, 0x0207, 0xFFFF, 0x0209, 0xFFFF, 0x020B, + 0xFFFF, 0x020D, 0xFFFF, 0x020F, 0xFFFF, 0x0211, 0xFFFF, 0x0213, 0xFFFF, 
0x0215, 0xFFFF, + 0x0217, 0xFFFF, 0x0219, 0xFFFF, 0x021B, 0xFFFF, 0x021D, 0xFFFF, 0x021F, 0xFFFF}; +static unsigned short unicode_fold_data13[] = { + 0x019E, 0xFFFF, 0x0223, 0xFFFF, 0x0225, 0xFFFF, 0x0227, 0xFFFF, 0x0229, 0xFFFF, 0x022B, + 0xFFFF, 0x022D, 0xFFFF, 0x022F, 0xFFFF, 0x0231, 0xFFFF, 0x0233, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C65, 0x023C, 0xFFFF, 0x019A, 0x2C66, 0xFFFF}; +static unsigned short unicode_fold_data14[] = { + 0xFFFF, 0x0242, 0xFFFF, 0x0180, 0x0289, 0x028C, 0x0247, 0xFFFF, 0x0249, 0xFFFF, 0x024B, + 0xFFFF, 0x024D, 0xFFFF, 0x024F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data15[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03B9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data16[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0371, 0xFFFF, 0x0373, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0377, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data17[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03AC, 0xFFFF, 0x03AD, 0x03AE, 0x03AF, + 0xFFFF, 0x03CC, 0xFFFF, 0x03CD, 0x03CE, 0xFFFF, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, + 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF}; +static unsigned short unicode_fold_data18[] = { + 0x03C0, 0x03C1, 0xFFFF, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, + 0x03CB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short 
unicode_fold_data19[] = { + 0xFFFF, 0xFFFF, 0x03C3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03D7, 0x03B2, 0x03B8, 0xFFFF, 0xFFFF, 0xFFFF, 0x03C6, + 0x03C0, 0xFFFF, 0x03D9, 0xFFFF, 0x03DB, 0xFFFF, 0x03DD, 0xFFFF, 0x03DF, 0xFFFF}; +static unsigned short unicode_fold_data20[] = { + 0x03E1, 0xFFFF, 0x03E3, 0xFFFF, 0x03E5, 0xFFFF, 0x03E7, 0xFFFF, 0x03E9, 0xFFFF, 0x03EB, + 0xFFFF, 0x03ED, 0xFFFF, 0x03EF, 0xFFFF, 0x03BA, 0x03C1, 0xFFFF, 0xFFFF, 0x03B8, 0x03B5, + 0xFFFF, 0x03F8, 0xFFFF, 0x03F2, 0x03FB, 0xFFFF, 0xFFFF, 0x037B, 0x037C, 0x037D}; +static unsigned short unicode_fold_data21[] = { + 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, + 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, + 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F}; +static unsigned short unicode_fold_data22[] = { + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, + 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data23[] = { + 0x0461, 0xFFFF, 0x0463, 0xFFFF, 0x0465, 0xFFFF, 0x0467, 0xFFFF, 0x0469, 0xFFFF, 0x046B, + 0xFFFF, 0x046D, 0xFFFF, 0x046F, 0xFFFF, 0x0471, 0xFFFF, 0x0473, 0xFFFF, 0x0475, 0xFFFF, + 0x0477, 0xFFFF, 0x0479, 0xFFFF, 0x047B, 0xFFFF, 0x047D, 0xFFFF, 0x047F, 0xFFFF}; +static unsigned short unicode_fold_data24[] = { + 0x0481, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x048B, + 0xFFFF, 0x048D, 0xFFFF, 0x048F, 0xFFFF, 0x0491, 0xFFFF, 0x0493, 0xFFFF, 0x0495, 0xFFFF, + 0x0497, 0xFFFF, 0x0499, 0xFFFF, 0x049B, 0xFFFF, 0x049D, 0xFFFF, 0x049F, 0xFFFF}; +static unsigned short unicode_fold_data25[] = { + 0x04A1, 0xFFFF, 0x04A3, 0xFFFF, 0x04A5, 0xFFFF, 0x04A7, 0xFFFF, 0x04A9, 0xFFFF, 0x04AB, + 0xFFFF, 
0x04AD, 0xFFFF, 0x04AF, 0xFFFF, 0x04B1, 0xFFFF, 0x04B3, 0xFFFF, 0x04B5, 0xFFFF, + 0x04B7, 0xFFFF, 0x04B9, 0xFFFF, 0x04BB, 0xFFFF, 0x04BD, 0xFFFF, 0x04BF, 0xFFFF}; +static unsigned short unicode_fold_data26[] = { + 0x04CF, 0x04C2, 0xFFFF, 0x04C4, 0xFFFF, 0x04C6, 0xFFFF, 0x04C8, 0xFFFF, 0x04CA, 0xFFFF, + 0x04CC, 0xFFFF, 0x04CE, 0xFFFF, 0xFFFF, 0x04D1, 0xFFFF, 0x04D3, 0xFFFF, 0x04D5, 0xFFFF, + 0x04D7, 0xFFFF, 0x04D9, 0xFFFF, 0x04DB, 0xFFFF, 0x04DD, 0xFFFF, 0x04DF, 0xFFFF}; +static unsigned short unicode_fold_data27[] = { + 0x04E1, 0xFFFF, 0x04E3, 0xFFFF, 0x04E5, 0xFFFF, 0x04E7, 0xFFFF, 0x04E9, 0xFFFF, 0x04EB, + 0xFFFF, 0x04ED, 0xFFFF, 0x04EF, 0xFFFF, 0x04F1, 0xFFFF, 0x04F3, 0xFFFF, 0x04F5, 0xFFFF, + 0x04F7, 0xFFFF, 0x04F9, 0xFFFF, 0x04FB, 0xFFFF, 0x04FD, 0xFFFF, 0x04FF, 0xFFFF}; +static unsigned short unicode_fold_data28[] = { + 0x0501, 0xFFFF, 0x0503, 0xFFFF, 0x0505, 0xFFFF, 0x0507, 0xFFFF, 0x0509, 0xFFFF, 0x050B, + 0xFFFF, 0x050D, 0xFFFF, 0x050F, 0xFFFF, 0x0511, 0xFFFF, 0x0513, 0xFFFF, 0x0515, 0xFFFF, + 0x0517, 0xFFFF, 0x0519, 0xFFFF, 0x051B, 0xFFFF, 0x051D, 0xFFFF, 0x051F, 0xFFFF}; +static unsigned short unicode_fold_data29[] = { + 0x0521, 0xFFFF, 0x0523, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, + 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F}; +static unsigned short unicode_fold_data30[] = { + 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, + 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, + 0x0586, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data31[] = { + 0x2D00, 0x2D01, 0x2D02, 0x2D03, 0x2D04, 0x2D05, 0x2D06, 0x2D07, 0x2D08, 0x2D09, 0x2D0A, + 0x2D0B, 0x2D0C, 0x2D0D, 0x2D0E, 0x2D0F, 0x2D10, 0x2D11, 0x2D12, 0x2D13, 0x2D14, 0x2D15, + 0x2D16, 0x2D17, 0x2D18, 0x2D19, 0x2D1A, 
0x2D1B, 0x2D1C, 0x2D1D, 0x2D1E, 0x2D1F}; +static unsigned short unicode_fold_data32[] = { + 0x2D20, 0x2D21, 0x2D22, 0x2D23, 0x2D24, 0x2D25, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data33[] = { + 0x1E01, 0xFFFF, 0x1E03, 0xFFFF, 0x1E05, 0xFFFF, 0x1E07, 0xFFFF, 0x1E09, 0xFFFF, 0x1E0B, + 0xFFFF, 0x1E0D, 0xFFFF, 0x1E0F, 0xFFFF, 0x1E11, 0xFFFF, 0x1E13, 0xFFFF, 0x1E15, 0xFFFF, + 0x1E17, 0xFFFF, 0x1E19, 0xFFFF, 0x1E1B, 0xFFFF, 0x1E1D, 0xFFFF, 0x1E1F, 0xFFFF}; +static unsigned short unicode_fold_data34[] = { + 0x1E21, 0xFFFF, 0x1E23, 0xFFFF, 0x1E25, 0xFFFF, 0x1E27, 0xFFFF, 0x1E29, 0xFFFF, 0x1E2B, + 0xFFFF, 0x1E2D, 0xFFFF, 0x1E2F, 0xFFFF, 0x1E31, 0xFFFF, 0x1E33, 0xFFFF, 0x1E35, 0xFFFF, + 0x1E37, 0xFFFF, 0x1E39, 0xFFFF, 0x1E3B, 0xFFFF, 0x1E3D, 0xFFFF, 0x1E3F, 0xFFFF}; +static unsigned short unicode_fold_data35[] = { + 0x1E41, 0xFFFF, 0x1E43, 0xFFFF, 0x1E45, 0xFFFF, 0x1E47, 0xFFFF, 0x1E49, 0xFFFF, 0x1E4B, + 0xFFFF, 0x1E4D, 0xFFFF, 0x1E4F, 0xFFFF, 0x1E51, 0xFFFF, 0x1E53, 0xFFFF, 0x1E55, 0xFFFF, + 0x1E57, 0xFFFF, 0x1E59, 0xFFFF, 0x1E5B, 0xFFFF, 0x1E5D, 0xFFFF, 0x1E5F, 0xFFFF}; +static unsigned short unicode_fold_data36[] = { + 0x1E61, 0xFFFF, 0x1E63, 0xFFFF, 0x1E65, 0xFFFF, 0x1E67, 0xFFFF, 0x1E69, 0xFFFF, 0x1E6B, + 0xFFFF, 0x1E6D, 0xFFFF, 0x1E6F, 0xFFFF, 0x1E71, 0xFFFF, 0x1E73, 0xFFFF, 0x1E75, 0xFFFF, + 0x1E77, 0xFFFF, 0x1E79, 0xFFFF, 0x1E7B, 0xFFFF, 0x1E7D, 0xFFFF, 0x1E7F, 0xFFFF}; +static unsigned short unicode_fold_data37[] = { + 0x1E81, 0xFFFF, 0x1E83, 0xFFFF, 0x1E85, 0xFFFF, 0x1E87, 0xFFFF, 0x1E89, 0xFFFF, 0x1E8B, + 0xFFFF, 0x1E8D, 0xFFFF, 0x1E8F, 0xFFFF, 0x1E91, 0xFFFF, 0x1E93, 0xFFFF, 0x1E95, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1E61, 0xFFFF, 0xFFFF, 0x00DF, 0xFFFF}; +static unsigned short unicode_fold_data38[] = { + 0x1EA1, 0xFFFF, 0x1EA3, 0xFFFF, 
0x1EA5, 0xFFFF, 0x1EA7, 0xFFFF, 0x1EA9, 0xFFFF, 0x1EAB, + 0xFFFF, 0x1EAD, 0xFFFF, 0x1EAF, 0xFFFF, 0x1EB1, 0xFFFF, 0x1EB3, 0xFFFF, 0x1EB5, 0xFFFF, + 0x1EB7, 0xFFFF, 0x1EB9, 0xFFFF, 0x1EBB, 0xFFFF, 0x1EBD, 0xFFFF, 0x1EBF, 0xFFFF}; +static unsigned short unicode_fold_data39[] = { + 0x1EC1, 0xFFFF, 0x1EC3, 0xFFFF, 0x1EC5, 0xFFFF, 0x1EC7, 0xFFFF, 0x1EC9, 0xFFFF, 0x1ECB, + 0xFFFF, 0x1ECD, 0xFFFF, 0x1ECF, 0xFFFF, 0x1ED1, 0xFFFF, 0x1ED3, 0xFFFF, 0x1ED5, 0xFFFF, + 0x1ED7, 0xFFFF, 0x1ED9, 0xFFFF, 0x1EDB, 0xFFFF, 0x1EDD, 0xFFFF, 0x1EDF, 0xFFFF}; +static unsigned short unicode_fold_data40[] = { + 0x1EE1, 0xFFFF, 0x1EE3, 0xFFFF, 0x1EE5, 0xFFFF, 0x1EE7, 0xFFFF, 0x1EE9, 0xFFFF, 0x1EEB, + 0xFFFF, 0x1EED, 0xFFFF, 0x1EEF, 0xFFFF, 0x1EF1, 0xFFFF, 0x1EF3, 0xFFFF, 0x1EF5, 0xFFFF, + 0x1EF7, 0xFFFF, 0x1EF9, 0xFFFF, 0x1EFB, 0xFFFF, 0x1EFD, 0xFFFF, 0x1EFF, 0xFFFF}; +static unsigned short unicode_fold_data41[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F00, 0x1F01, 0x1F02, + 0x1F03, 0x1F04, 0x1F05, 0x1F06, 0x1F07, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F10, 0x1F11, 0x1F12, 0x1F13, 0x1F14, 0x1F15, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data42[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F20, 0x1F21, 0x1F22, + 0x1F23, 0x1F24, 0x1F25, 0x1F26, 0x1F27, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F30, 0x1F31, 0x1F32, 0x1F33, 0x1F34, 0x1F35, 0x1F36, 0x1F37}; +static unsigned short unicode_fold_data43[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F40, 0x1F41, 0x1F42, + 0x1F43, 0x1F44, 0x1F45, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x1F51, 0xFFFF, 0x1F53, 0xFFFF, 0x1F55, 0xFFFF, 0x1F57}; +static unsigned short unicode_fold_data44[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F60, 0x1F61, 0x1F62, + 0x1F63, 0x1F64, 0x1F65, 0x1F66, 0x1F67, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data45[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F80, 0x1F81, 0x1F82, + 0x1F83, 0x1F84, 0x1F85, 0x1F86, 0x1F87, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F90, 0x1F91, 0x1F92, 0x1F93, 0x1F94, 0x1F95, 0x1F96, 0x1F97}; +static unsigned short unicode_fold_data46[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FA0, 0x1FA1, 0x1FA2, + 0x1FA3, 0x1FA4, 0x1FA5, 0x1FA6, 0x1FA7, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1FB0, 0x1FB1, 0x1F70, 0x1F71, 0x1FB3, 0xFFFF, 0x03B9, 0xFFFF}; +static unsigned short unicode_fold_data47[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F72, 0x1F73, 0x1F74, + 0x1F75, 0x1FC3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1FD0, 0x1FD1, 0x1F76, 0x1F77, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data48[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FE0, 0x1FE1, 0x1F7A, + 0x1F7B, 0x1FE5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F78, 0x1F79, 0x1F7C, 0x1F7D, 0x1FF3, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data49[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03C9, 0xFFFF, 0xFFFF, 0xFFFF, 0x006B, + 0x00E5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x214E, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data50[] = { + 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, + 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned 
short unicode_fold_data51[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x2184, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data52[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x24D0, 0x24D1, 0x24D2, 0x24D3, 0x24D4, 0x24D5, 0x24D6, 0x24D7, 0x24D8, 0x24D9}; +static unsigned short unicode_fold_data53[] = { + 0x24DA, 0x24DB, 0x24DC, 0x24DD, 0x24DE, 0x24DF, 0x24E0, 0x24E1, 0x24E2, 0x24E3, 0x24E4, + 0x24E5, 0x24E6, 0x24E7, 0x24E8, 0x24E9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data54[] = { + 0x2C30, 0x2C31, 0x2C32, 0x2C33, 0x2C34, 0x2C35, 0x2C36, 0x2C37, 0x2C38, 0x2C39, 0x2C3A, + 0x2C3B, 0x2C3C, 0x2C3D, 0x2C3E, 0x2C3F, 0x2C40, 0x2C41, 0x2C42, 0x2C43, 0x2C44, 0x2C45, + 0x2C46, 0x2C47, 0x2C48, 0x2C49, 0x2C4A, 0x2C4B, 0x2C4C, 0x2C4D, 0x2C4E, 0x2C4F}; +static unsigned short unicode_fold_data55[] = { + 0x2C50, 0x2C51, 0x2C52, 0x2C53, 0x2C54, 0x2C55, 0x2C56, 0x2C57, 0x2C58, 0x2C59, 0x2C5A, + 0x2C5B, 0x2C5C, 0x2C5D, 0x2C5E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data56[] = { + 0x2C61, 0xFFFF, 0x026B, 0x1D7D, 0x027D, 0xFFFF, 0xFFFF, 0x2C68, 0xFFFF, 0x2C6A, 0xFFFF, + 0x2C6C, 0xFFFF, 0x0251, 0x0271, 0x0250, 0xFFFF, 0xFFFF, 0x2C73, 0xFFFF, 0xFFFF, 0x2C76, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data57[] = { + 0x2C81, 0xFFFF, 0x2C83, 0xFFFF, 0x2C85, 0xFFFF, 0x2C87, 0xFFFF, 0x2C89, 0xFFFF, 0x2C8B, + 
0xFFFF, 0x2C8D, 0xFFFF, 0x2C8F, 0xFFFF, 0x2C91, 0xFFFF, 0x2C93, 0xFFFF, 0x2C95, 0xFFFF, + 0x2C97, 0xFFFF, 0x2C99, 0xFFFF, 0x2C9B, 0xFFFF, 0x2C9D, 0xFFFF, 0x2C9F, 0xFFFF}; +static unsigned short unicode_fold_data58[] = { + 0x2CA1, 0xFFFF, 0x2CA3, 0xFFFF, 0x2CA5, 0xFFFF, 0x2CA7, 0xFFFF, 0x2CA9, 0xFFFF, 0x2CAB, + 0xFFFF, 0x2CAD, 0xFFFF, 0x2CAF, 0xFFFF, 0x2CB1, 0xFFFF, 0x2CB3, 0xFFFF, 0x2CB5, 0xFFFF, + 0x2CB7, 0xFFFF, 0x2CB9, 0xFFFF, 0x2CBB, 0xFFFF, 0x2CBD, 0xFFFF, 0x2CBF, 0xFFFF}; +static unsigned short unicode_fold_data59[] = { + 0x2CC1, 0xFFFF, 0x2CC3, 0xFFFF, 0x2CC5, 0xFFFF, 0x2CC7, 0xFFFF, 0x2CC9, 0xFFFF, 0x2CCB, + 0xFFFF, 0x2CCD, 0xFFFF, 0x2CCF, 0xFFFF, 0x2CD1, 0xFFFF, 0x2CD3, 0xFFFF, 0x2CD5, 0xFFFF, + 0x2CD7, 0xFFFF, 0x2CD9, 0xFFFF, 0x2CDB, 0xFFFF, 0x2CDD, 0xFFFF, 0x2CDF, 0xFFFF}; +static unsigned short unicode_fold_data60[] = { + 0x2CE1, 0xFFFF, 0x2CE3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data61[] = { + 0xA641, 0xFFFF, 0xA643, 0xFFFF, 0xA645, 0xFFFF, 0xA647, 0xFFFF, 0xA649, 0xFFFF, 0xA64B, + 0xFFFF, 0xA64D, 0xFFFF, 0xA64F, 0xFFFF, 0xA651, 0xFFFF, 0xA653, 0xFFFF, 0xA655, 0xFFFF, + 0xA657, 0xFFFF, 0xA659, 0xFFFF, 0xA65B, 0xFFFF, 0xA65D, 0xFFFF, 0xA65F, 0xFFFF}; +static unsigned short unicode_fold_data62[] = { + 0xFFFF, 0xFFFF, 0xA663, 0xFFFF, 0xA665, 0xFFFF, 0xA667, 0xFFFF, 0xA669, 0xFFFF, 0xA66B, + 0xFFFF, 0xA66D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data63[] = { + 0xA681, 0xFFFF, 0xA683, 0xFFFF, 0xA685, 0xFFFF, 0xA687, 0xFFFF, 0xA689, 0xFFFF, 0xA68B, + 0xFFFF, 0xA68D, 0xFFFF, 0xA68F, 0xFFFF, 0xA691, 0xFFFF, 0xA693, 0xFFFF, 0xA695, 0xFFFF, + 0xA697, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data64[] = { + 0xFFFF, 0xFFFF, 0xA723, 0xFFFF, 0xA725, 0xFFFF, 0xA727, 0xFFFF, 0xA729, 0xFFFF, 0xA72B, + 0xFFFF, 0xA72D, 0xFFFF, 0xA72F, 0xFFFF, 0xFFFF, 0xFFFF, 0xA733, 0xFFFF, 0xA735, 0xFFFF, + 0xA737, 0xFFFF, 0xA739, 0xFFFF, 0xA73B, 0xFFFF, 0xA73D, 0xFFFF, 0xA73F, 0xFFFF}; +static unsigned short unicode_fold_data65[] = { + 0xA741, 0xFFFF, 0xA743, 0xFFFF, 0xA745, 0xFFFF, 0xA747, 0xFFFF, 0xA749, 0xFFFF, 0xA74B, + 0xFFFF, 0xA74D, 0xFFFF, 0xA74F, 0xFFFF, 0xA751, 0xFFFF, 0xA753, 0xFFFF, 0xA755, 0xFFFF, + 0xA757, 0xFFFF, 0xA759, 0xFFFF, 0xA75B, 0xFFFF, 0xA75D, 0xFFFF, 0xA75F, 0xFFFF}; +static unsigned short unicode_fold_data66[] = { + 0xA761, 0xFFFF, 0xA763, 0xFFFF, 0xA765, 0xFFFF, 0xA767, 0xFFFF, 0xA769, 0xFFFF, 0xA76B, + 0xFFFF, 0xA76D, 0xFFFF, 0xA76F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xA77A, 0xFFFF, 0xA77C, 0xFFFF, 0x1D79, 0xA77F, 0xFFFF}; +static unsigned short unicode_fold_data67[] = { + 0xA781, 0xFFFF, 0xA783, 0xFFFF, 0xA785, 0xFFFF, 0xA787, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xA78C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_fold_data68[] = { + 0xFFFF, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, + 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, + 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + +static unsigned short* unicode_fold_data_table[UNICODE_FOLD_BLOCK_COUNT] = { + unicode_fold_data0, unicode_fold_data1, unicode_fold_data2, unicode_fold_data3, + unicode_fold_data4, unicode_fold_data5, unicode_fold_data6, unicode_fold_data7, + unicode_fold_data8, unicode_fold_data9, unicode_fold_data10, unicode_fold_data11, + unicode_fold_data12, unicode_fold_data13, 
unicode_fold_data14, unicode_fold_data15, + unicode_fold_data16, unicode_fold_data17, unicode_fold_data18, unicode_fold_data19, + unicode_fold_data20, unicode_fold_data21, unicode_fold_data22, unicode_fold_data23, + unicode_fold_data24, unicode_fold_data25, unicode_fold_data26, unicode_fold_data27, + unicode_fold_data28, unicode_fold_data29, unicode_fold_data30, unicode_fold_data31, + unicode_fold_data32, unicode_fold_data33, unicode_fold_data34, unicode_fold_data35, + unicode_fold_data36, unicode_fold_data37, unicode_fold_data38, unicode_fold_data39, + unicode_fold_data40, unicode_fold_data41, unicode_fold_data42, unicode_fold_data43, + unicode_fold_data44, unicode_fold_data45, unicode_fold_data46, unicode_fold_data47, + unicode_fold_data48, unicode_fold_data49, unicode_fold_data50, unicode_fold_data51, + unicode_fold_data52, unicode_fold_data53, unicode_fold_data54, unicode_fold_data55, + unicode_fold_data56, unicode_fold_data57, unicode_fold_data58, unicode_fold_data59, + unicode_fold_data60, unicode_fold_data61, unicode_fold_data62, unicode_fold_data63, + unicode_fold_data64, unicode_fold_data65, unicode_fold_data66, unicode_fold_data67, + unicode_fold_data68}; +/* Generated by builder. Do not modify. End unicode_fold_tables */ + /* sqlite3_unicode_fold: map one 16-bit code unit through the fold tables above. The high bits of c (c >> UNICODE_FOLD_BLOCK_SHIFT) select a block id from unicode_fold_indexes; the low bits (c & UNICODE_FOLD_BLOCK_MASK) select a slot inside that block. Returns c unchanged when the slot spans exactly one data entry and that entry is 0xFFFF (treated here as "no mapping"); otherwise returns the first data entry stored for the slot. */ +SQLITE_EXPORT u16 sqlite3_unicode_fold(u16 c) { + u16 index = unicode_fold_indexes[(c) >> UNICODE_FOLD_BLOCK_SHIFT]; /* block id for the high bits of c */ + u8 position = (c)&UNICODE_FOLD_BLOCK_MASK; /* slot within the selected block */ + u16(p) = (unicode_fold_data_table[index][unicode_fold_positions[index][position]]); /* first data entry stored for this slot */ + int l = unicode_fold_positions[index][position + 1] - unicode_fold_positions[index][position]; /* distance to the next slot's offset, i.e. how many data entries this slot spans */ + + /* only the first entry p is ever returned, even when the slot spans more than one entry */ return ((l == 1) && ((p) == 0xFFFF)) ? c : p; +} +#endif + +#ifdef SQLITE3_UNICODE_LOWER +/* Generated by builder. Do not modify.
Start unicode_lower_defines */ +#define UNICODE_LOWER_BLOCK_SHIFT 5 +#define UNICODE_LOWER_BLOCK_MASK ((1 << UNICODE_LOWER_BLOCK_SHIFT) - 1) +#define UNICODE_LOWER_BLOCK_SIZE (1 << UNICODE_LOWER_BLOCK_SHIFT) +#define UNICODE_LOWER_BLOCK_COUNT 67 +#define UNICODE_LOWER_INDEXES_SIZE (0x10000 >> UNICODE_LOWER_BLOCK_SHIFT) +/* Generated by builder. Do not modify. End unicode_lower_defines */ + +/* Generated by builder. Do not modify. Start unicode_lower_tables */ + +static unsigned short unicode_lower_indexes[UNICODE_LOWER_INDEXES_SIZE] = { + 0, 0, 1, 0, 0, 0, 2, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0, 0, 0, + 0, 0, 0, 14, 15, 16, 17, 18, 19, 20, 0, 21, 22, 23, 24, 25, 26, 27, 28, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 47, 0, 48, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 53, 0, 54, 55, 56, 57, 58, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 60, 61, 0, 0, 0, 0, 62, 63, 64, 65, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 66, 0, 0, 0, 0, 0, 0}; + +static unsigned char + unicode_lower_positions[UNICODE_LOWER_BLOCK_COUNT][UNICODE_LOWER_BLOCK_SIZE + 1] = { + /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 1 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 2 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 3 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 4 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 5 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 6 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 7 */ {0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 8 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 9 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 10 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 11 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 12 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 13 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 14 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 15 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 16 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 17 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 18 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 19 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 20 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 21 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 22 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 23 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 24 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 25 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 26 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 27 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 28 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 29 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 30 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 32 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 33 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 34 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 35 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 36 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 37 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 38 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 39 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 40 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 41 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 42 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 43 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 44 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 45 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 46 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 47 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 48 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 49 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 50 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32}, + /* 51 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 52 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 53 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 54 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 55 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 56 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 57 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 58 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 59 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 60 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 61 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 62 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 63 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 64 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 65 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 66 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}}; + +static unsigned short unicode_lower_data0[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data1[] = { + 0xFFFF, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, + 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, + 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data2[] = { + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, + 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, + 0x00F6, 0xFFFF, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0xFFFF}; +static unsigned short unicode_lower_data3[] = { + 0x0101, 0xFFFF, 0x0103, 0xFFFF, 0x0105, 0xFFFF, 0x0107, 0xFFFF, 0x0109, 0xFFFF, 0x010B, + 0xFFFF, 0x010D, 0xFFFF, 0x010F, 0xFFFF, 0x0111, 0xFFFF, 0x0113, 0xFFFF, 0x0115, 0xFFFF, + 0x0117, 0xFFFF, 0x0119, 0xFFFF, 0x011B, 0xFFFF, 0x011D, 0xFFFF, 0x011F, 0xFFFF}; +static unsigned short unicode_lower_data4[] = { + 0x0121, 0xFFFF, 0x0123, 0xFFFF, 0x0125, 0xFFFF, 0x0127, 0xFFFF, 0x0129, 0xFFFF, 0x012B, + 0xFFFF, 0x012D, 0xFFFF, 0x012F, 0xFFFF, 0x0069, 0xFFFF, 0x0133, 0xFFFF, 0x0135, 0xFFFF, + 0x0137, 0xFFFF, 0xFFFF, 0x013A, 0xFFFF, 0x013C, 0xFFFF, 0x013E, 0xFFFF, 0x0140}; +static unsigned short unicode_lower_data5[] = { + 0xFFFF, 0x0142, 0xFFFF, 0x0144, 0xFFFF, 0x0146, 0xFFFF, 0x0148, 0xFFFF, 0xFFFF, 0x014B, + 0xFFFF, 0x014D, 0xFFFF, 0x014F, 0xFFFF, 0x0151, 0xFFFF, 0x0153, 0xFFFF, 0x0155, 0xFFFF, 
+ 0x0157, 0xFFFF, 0x0159, 0xFFFF, 0x015B, 0xFFFF, 0x015D, 0xFFFF, 0x015F, 0xFFFF}; +static unsigned short unicode_lower_data6[] = { + 0x0161, 0xFFFF, 0x0163, 0xFFFF, 0x0165, 0xFFFF, 0x0167, 0xFFFF, 0x0169, 0xFFFF, 0x016B, + 0xFFFF, 0x016D, 0xFFFF, 0x016F, 0xFFFF, 0x0171, 0xFFFF, 0x0173, 0xFFFF, 0x0175, 0xFFFF, + 0x0177, 0xFFFF, 0x00FF, 0x017A, 0xFFFF, 0x017C, 0xFFFF, 0x017E, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data7[] = { + 0xFFFF, 0x0253, 0x0183, 0xFFFF, 0x0185, 0xFFFF, 0x0254, 0x0188, 0xFFFF, 0x0256, 0x0257, + 0x018C, 0xFFFF, 0xFFFF, 0x01DD, 0x0259, 0x025B, 0x0192, 0xFFFF, 0x0260, 0x0263, 0xFFFF, + 0x0269, 0x0268, 0x0199, 0xFFFF, 0xFFFF, 0xFFFF, 0x026F, 0x0272, 0xFFFF, 0x0275}; +static unsigned short unicode_lower_data8[] = { + 0x01A1, 0xFFFF, 0x01A3, 0xFFFF, 0x01A5, 0xFFFF, 0x0280, 0x01A8, 0xFFFF, 0x0283, 0xFFFF, + 0xFFFF, 0x01AD, 0xFFFF, 0x0288, 0x01B0, 0xFFFF, 0x028A, 0x028B, 0x01B4, 0xFFFF, 0x01B6, + 0xFFFF, 0x0292, 0x01B9, 0xFFFF, 0xFFFF, 0xFFFF, 0x01BD, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data9[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01C6, 0x01C6, 0xFFFF, 0x01C9, 0x01C9, 0xFFFF, 0x01CC, + 0x01CC, 0xFFFF, 0x01CE, 0xFFFF, 0x01D0, 0xFFFF, 0x01D2, 0xFFFF, 0x01D4, 0xFFFF, 0x01D6, + 0xFFFF, 0x01D8, 0xFFFF, 0x01DA, 0xFFFF, 0x01DC, 0xFFFF, 0xFFFF, 0x01DF, 0xFFFF}; +static unsigned short unicode_lower_data10[] = { + 0x01E1, 0xFFFF, 0x01E3, 0xFFFF, 0x01E5, 0xFFFF, 0x01E7, 0xFFFF, 0x01E9, 0xFFFF, 0x01EB, + 0xFFFF, 0x01ED, 0xFFFF, 0x01EF, 0xFFFF, 0xFFFF, 0x01F3, 0x01F3, 0xFFFF, 0x01F5, 0xFFFF, + 0x0195, 0x01BF, 0x01F9, 0xFFFF, 0x01FB, 0xFFFF, 0x01FD, 0xFFFF, 0x01FF, 0xFFFF}; +static unsigned short unicode_lower_data11[] = { + 0x0201, 0xFFFF, 0x0203, 0xFFFF, 0x0205, 0xFFFF, 0x0207, 0xFFFF, 0x0209, 0xFFFF, 0x020B, + 0xFFFF, 0x020D, 0xFFFF, 0x020F, 0xFFFF, 0x0211, 0xFFFF, 0x0213, 0xFFFF, 0x0215, 0xFFFF, + 0x0217, 0xFFFF, 0x0219, 0xFFFF, 0x021B, 0xFFFF, 0x021D, 0xFFFF, 0x021F, 0xFFFF}; +static unsigned short 
unicode_lower_data12[] = { + 0x019E, 0xFFFF, 0x0223, 0xFFFF, 0x0225, 0xFFFF, 0x0227, 0xFFFF, 0x0229, 0xFFFF, 0x022B, + 0xFFFF, 0x022D, 0xFFFF, 0x022F, 0xFFFF, 0x0231, 0xFFFF, 0x0233, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C65, 0x023C, 0xFFFF, 0x019A, 0x2C66, 0xFFFF}; +static unsigned short unicode_lower_data13[] = { + 0xFFFF, 0x0242, 0xFFFF, 0x0180, 0x0289, 0x028C, 0x0247, 0xFFFF, 0x0249, 0xFFFF, 0x024B, + 0xFFFF, 0x024D, 0xFFFF, 0x024F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data14[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0371, 0xFFFF, 0x0373, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0377, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data15[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03AC, 0xFFFF, 0x03AD, 0x03AE, 0x03AF, + 0xFFFF, 0x03CC, 0xFFFF, 0x03CD, 0x03CE, 0xFFFF, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, + 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF}; +static unsigned short unicode_lower_data16[] = { + 0x03C0, 0x03C1, 0xFFFF, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, + 0x03CB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data17[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03D7, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x03D9, 0xFFFF, 0x03DB, 0xFFFF, 0x03DD, 0xFFFF, 0x03DF, 0xFFFF}; +static unsigned short unicode_lower_data18[] = { + 0x03E1, 0xFFFF, 0x03E3, 0xFFFF, 0x03E5, 0xFFFF, 0x03E7, 0xFFFF, 0x03E9, 0xFFFF, 0x03EB, + 
0xFFFF, 0x03ED, 0xFFFF, 0x03EF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03B8, 0xFFFF, + 0xFFFF, 0x03F8, 0xFFFF, 0x03F2, 0x03FB, 0xFFFF, 0xFFFF, 0x037B, 0x037C, 0x037D}; +static unsigned short unicode_lower_data19[] = { + 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, + 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, + 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F}; +static unsigned short unicode_lower_data20[] = { + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, + 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data21[] = { + 0x0461, 0xFFFF, 0x0463, 0xFFFF, 0x0465, 0xFFFF, 0x0467, 0xFFFF, 0x0469, 0xFFFF, 0x046B, + 0xFFFF, 0x046D, 0xFFFF, 0x046F, 0xFFFF, 0x0471, 0xFFFF, 0x0473, 0xFFFF, 0x0475, 0xFFFF, + 0x0477, 0xFFFF, 0x0479, 0xFFFF, 0x047B, 0xFFFF, 0x047D, 0xFFFF, 0x047F, 0xFFFF}; +static unsigned short unicode_lower_data22[] = { + 0x0481, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x048B, + 0xFFFF, 0x048D, 0xFFFF, 0x048F, 0xFFFF, 0x0491, 0xFFFF, 0x0493, 0xFFFF, 0x0495, 0xFFFF, + 0x0497, 0xFFFF, 0x0499, 0xFFFF, 0x049B, 0xFFFF, 0x049D, 0xFFFF, 0x049F, 0xFFFF}; +static unsigned short unicode_lower_data23[] = { + 0x04A1, 0xFFFF, 0x04A3, 0xFFFF, 0x04A5, 0xFFFF, 0x04A7, 0xFFFF, 0x04A9, 0xFFFF, 0x04AB, + 0xFFFF, 0x04AD, 0xFFFF, 0x04AF, 0xFFFF, 0x04B1, 0xFFFF, 0x04B3, 0xFFFF, 0x04B5, 0xFFFF, + 0x04B7, 0xFFFF, 0x04B9, 0xFFFF, 0x04BB, 0xFFFF, 0x04BD, 0xFFFF, 0x04BF, 0xFFFF}; +static unsigned short unicode_lower_data24[] = { + 0x04CF, 0x04C2, 0xFFFF, 0x04C4, 0xFFFF, 0x04C6, 0xFFFF, 0x04C8, 0xFFFF, 0x04CA, 0xFFFF, + 0x04CC, 0xFFFF, 0x04CE, 0xFFFF, 0xFFFF, 0x04D1, 0xFFFF, 0x04D3, 0xFFFF, 0x04D5, 0xFFFF, + 0x04D7, 0xFFFF, 0x04D9, 0xFFFF, 
0x04DB, 0xFFFF, 0x04DD, 0xFFFF, 0x04DF, 0xFFFF}; +static unsigned short unicode_lower_data25[] = { + 0x04E1, 0xFFFF, 0x04E3, 0xFFFF, 0x04E5, 0xFFFF, 0x04E7, 0xFFFF, 0x04E9, 0xFFFF, 0x04EB, + 0xFFFF, 0x04ED, 0xFFFF, 0x04EF, 0xFFFF, 0x04F1, 0xFFFF, 0x04F3, 0xFFFF, 0x04F5, 0xFFFF, + 0x04F7, 0xFFFF, 0x04F9, 0xFFFF, 0x04FB, 0xFFFF, 0x04FD, 0xFFFF, 0x04FF, 0xFFFF}; +static unsigned short unicode_lower_data26[] = { + 0x0501, 0xFFFF, 0x0503, 0xFFFF, 0x0505, 0xFFFF, 0x0507, 0xFFFF, 0x0509, 0xFFFF, 0x050B, + 0xFFFF, 0x050D, 0xFFFF, 0x050F, 0xFFFF, 0x0511, 0xFFFF, 0x0513, 0xFFFF, 0x0515, 0xFFFF, + 0x0517, 0xFFFF, 0x0519, 0xFFFF, 0x051B, 0xFFFF, 0x051D, 0xFFFF, 0x051F, 0xFFFF}; +static unsigned short unicode_lower_data27[] = { + 0x0521, 0xFFFF, 0x0523, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, + 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F}; +static unsigned short unicode_lower_data28[] = { + 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, + 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, + 0x0586, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data29[] = { + 0x2D00, 0x2D01, 0x2D02, 0x2D03, 0x2D04, 0x2D05, 0x2D06, 0x2D07, 0x2D08, 0x2D09, 0x2D0A, + 0x2D0B, 0x2D0C, 0x2D0D, 0x2D0E, 0x2D0F, 0x2D10, 0x2D11, 0x2D12, 0x2D13, 0x2D14, 0x2D15, + 0x2D16, 0x2D17, 0x2D18, 0x2D19, 0x2D1A, 0x2D1B, 0x2D1C, 0x2D1D, 0x2D1E, 0x2D1F}; +static unsigned short unicode_lower_data30[] = { + 0x2D20, 0x2D21, 0x2D22, 0x2D23, 0x2D24, 0x2D25, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data31[] = { + 0x1E01, 0xFFFF, 
0x1E03, 0xFFFF, 0x1E05, 0xFFFF, 0x1E07, 0xFFFF, 0x1E09, 0xFFFF, 0x1E0B, + 0xFFFF, 0x1E0D, 0xFFFF, 0x1E0F, 0xFFFF, 0x1E11, 0xFFFF, 0x1E13, 0xFFFF, 0x1E15, 0xFFFF, + 0x1E17, 0xFFFF, 0x1E19, 0xFFFF, 0x1E1B, 0xFFFF, 0x1E1D, 0xFFFF, 0x1E1F, 0xFFFF}; +static unsigned short unicode_lower_data32[] = { + 0x1E21, 0xFFFF, 0x1E23, 0xFFFF, 0x1E25, 0xFFFF, 0x1E27, 0xFFFF, 0x1E29, 0xFFFF, 0x1E2B, + 0xFFFF, 0x1E2D, 0xFFFF, 0x1E2F, 0xFFFF, 0x1E31, 0xFFFF, 0x1E33, 0xFFFF, 0x1E35, 0xFFFF, + 0x1E37, 0xFFFF, 0x1E39, 0xFFFF, 0x1E3B, 0xFFFF, 0x1E3D, 0xFFFF, 0x1E3F, 0xFFFF}; +static unsigned short unicode_lower_data33[] = { + 0x1E41, 0xFFFF, 0x1E43, 0xFFFF, 0x1E45, 0xFFFF, 0x1E47, 0xFFFF, 0x1E49, 0xFFFF, 0x1E4B, + 0xFFFF, 0x1E4D, 0xFFFF, 0x1E4F, 0xFFFF, 0x1E51, 0xFFFF, 0x1E53, 0xFFFF, 0x1E55, 0xFFFF, + 0x1E57, 0xFFFF, 0x1E59, 0xFFFF, 0x1E5B, 0xFFFF, 0x1E5D, 0xFFFF, 0x1E5F, 0xFFFF}; +static unsigned short unicode_lower_data34[] = { + 0x1E61, 0xFFFF, 0x1E63, 0xFFFF, 0x1E65, 0xFFFF, 0x1E67, 0xFFFF, 0x1E69, 0xFFFF, 0x1E6B, + 0xFFFF, 0x1E6D, 0xFFFF, 0x1E6F, 0xFFFF, 0x1E71, 0xFFFF, 0x1E73, 0xFFFF, 0x1E75, 0xFFFF, + 0x1E77, 0xFFFF, 0x1E79, 0xFFFF, 0x1E7B, 0xFFFF, 0x1E7D, 0xFFFF, 0x1E7F, 0xFFFF}; +static unsigned short unicode_lower_data35[] = { + 0x1E81, 0xFFFF, 0x1E83, 0xFFFF, 0x1E85, 0xFFFF, 0x1E87, 0xFFFF, 0x1E89, 0xFFFF, 0x1E8B, + 0xFFFF, 0x1E8D, 0xFFFF, 0x1E8F, 0xFFFF, 0x1E91, 0xFFFF, 0x1E93, 0xFFFF, 0x1E95, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x00DF, 0xFFFF}; +static unsigned short unicode_lower_data36[] = { + 0x1EA1, 0xFFFF, 0x1EA3, 0xFFFF, 0x1EA5, 0xFFFF, 0x1EA7, 0xFFFF, 0x1EA9, 0xFFFF, 0x1EAB, + 0xFFFF, 0x1EAD, 0xFFFF, 0x1EAF, 0xFFFF, 0x1EB1, 0xFFFF, 0x1EB3, 0xFFFF, 0x1EB5, 0xFFFF, + 0x1EB7, 0xFFFF, 0x1EB9, 0xFFFF, 0x1EBB, 0xFFFF, 0x1EBD, 0xFFFF, 0x1EBF, 0xFFFF}; +static unsigned short unicode_lower_data37[] = { + 0x1EC1, 0xFFFF, 0x1EC3, 0xFFFF, 0x1EC5, 0xFFFF, 0x1EC7, 0xFFFF, 0x1EC9, 0xFFFF, 0x1ECB, + 0xFFFF, 0x1ECD, 0xFFFF, 0x1ECF, 0xFFFF, 0x1ED1, 
0xFFFF, 0x1ED3, 0xFFFF, 0x1ED5, 0xFFFF, + 0x1ED7, 0xFFFF, 0x1ED9, 0xFFFF, 0x1EDB, 0xFFFF, 0x1EDD, 0xFFFF, 0x1EDF, 0xFFFF}; +static unsigned short unicode_lower_data38[] = { + 0x1EE1, 0xFFFF, 0x1EE3, 0xFFFF, 0x1EE5, 0xFFFF, 0x1EE7, 0xFFFF, 0x1EE9, 0xFFFF, 0x1EEB, + 0xFFFF, 0x1EED, 0xFFFF, 0x1EEF, 0xFFFF, 0x1EF1, 0xFFFF, 0x1EF3, 0xFFFF, 0x1EF5, 0xFFFF, + 0x1EF7, 0xFFFF, 0x1EF9, 0xFFFF, 0x1EFB, 0xFFFF, 0x1EFD, 0xFFFF, 0x1EFF, 0xFFFF}; +static unsigned short unicode_lower_data39[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F00, 0x1F01, 0x1F02, + 0x1F03, 0x1F04, 0x1F05, 0x1F06, 0x1F07, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F10, 0x1F11, 0x1F12, 0x1F13, 0x1F14, 0x1F15, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data40[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F20, 0x1F21, 0x1F22, + 0x1F23, 0x1F24, 0x1F25, 0x1F26, 0x1F27, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F30, 0x1F31, 0x1F32, 0x1F33, 0x1F34, 0x1F35, 0x1F36, 0x1F37}; +static unsigned short unicode_lower_data41[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F40, 0x1F41, 0x1F42, + 0x1F43, 0x1F44, 0x1F45, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x1F51, 0xFFFF, 0x1F53, 0xFFFF, 0x1F55, 0xFFFF, 0x1F57}; +static unsigned short unicode_lower_data42[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F60, 0x1F61, 0x1F62, + 0x1F63, 0x1F64, 0x1F65, 0x1F66, 0x1F67, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data43[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F80, 0x1F81, 0x1F82, + 0x1F83, 0x1F84, 0x1F85, 0x1F86, 0x1F87, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F90, 0x1F91, 0x1F92, 0x1F93, 0x1F94, 0x1F95, 0x1F96, 
0x1F97}; +static unsigned short unicode_lower_data44[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FA0, 0x1FA1, 0x1FA2, + 0x1FA3, 0x1FA4, 0x1FA5, 0x1FA6, 0x1FA7, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1FB0, 0x1FB1, 0x1F70, 0x1F71, 0x1FB3, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data45[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F72, 0x1F73, 0x1F74, + 0x1F75, 0x1FC3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1FD0, 0x1FD1, 0x1F76, 0x1F77, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data46[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FE0, 0x1FE1, 0x1F7A, + 0x1F7B, 0x1FE5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x1F78, 0x1F79, 0x1F7C, 0x1F7D, 0x1FF3, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data47[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03C9, 0xFFFF, 0xFFFF, 0xFFFF, 0x006B, + 0x00E5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x214E, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data48[] = { + 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, + 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data49[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x2184, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data50[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x24D0, 0x24D1, 0x24D2, 0x24D3, 0x24D4, 0x24D5, 0x24D6, 0x24D7, 0x24D8, 0x24D9}; +static unsigned short unicode_lower_data51[] = { + 0x24DA, 0x24DB, 0x24DC, 0x24DD, 0x24DE, 0x24DF, 0x24E0, 0x24E1, 0x24E2, 0x24E3, 0x24E4, + 0x24E5, 0x24E6, 0x24E7, 0x24E8, 0x24E9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data52[] = { + 0x2C30, 0x2C31, 0x2C32, 0x2C33, 0x2C34, 0x2C35, 0x2C36, 0x2C37, 0x2C38, 0x2C39, 0x2C3A, + 0x2C3B, 0x2C3C, 0x2C3D, 0x2C3E, 0x2C3F, 0x2C40, 0x2C41, 0x2C42, 0x2C43, 0x2C44, 0x2C45, + 0x2C46, 0x2C47, 0x2C48, 0x2C49, 0x2C4A, 0x2C4B, 0x2C4C, 0x2C4D, 0x2C4E, 0x2C4F}; +static unsigned short unicode_lower_data53[] = { + 0x2C50, 0x2C51, 0x2C52, 0x2C53, 0x2C54, 0x2C55, 0x2C56, 0x2C57, 0x2C58, 0x2C59, 0x2C5A, + 0x2C5B, 0x2C5C, 0x2C5D, 0x2C5E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data54[] = { + 0x2C61, 0xFFFF, 0x026B, 0x1D7D, 0x027D, 0xFFFF, 0xFFFF, 0x2C68, 0xFFFF, 0x2C6A, 0xFFFF, + 0x2C6C, 0xFFFF, 0x0251, 0x0271, 0x0250, 0xFFFF, 0xFFFF, 0x2C73, 0xFFFF, 0xFFFF, 0x2C76, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data55[] = { + 0x2C81, 0xFFFF, 0x2C83, 0xFFFF, 0x2C85, 0xFFFF, 0x2C87, 0xFFFF, 0x2C89, 0xFFFF, 0x2C8B, + 0xFFFF, 0x2C8D, 0xFFFF, 0x2C8F, 0xFFFF, 0x2C91, 0xFFFF, 0x2C93, 0xFFFF, 0x2C95, 0xFFFF, + 0x2C97, 0xFFFF, 0x2C99, 0xFFFF, 0x2C9B, 0xFFFF, 0x2C9D, 0xFFFF, 0x2C9F, 0xFFFF}; +static unsigned short unicode_lower_data56[] = { + 0x2CA1, 0xFFFF, 0x2CA3, 0xFFFF, 0x2CA5, 0xFFFF, 0x2CA7, 0xFFFF, 0x2CA9, 0xFFFF, 0x2CAB, + 0xFFFF, 0x2CAD, 0xFFFF, 0x2CAF, 0xFFFF, 0x2CB1, 0xFFFF, 0x2CB3, 0xFFFF, 0x2CB5, 0xFFFF, 
+ 0x2CB7, 0xFFFF, 0x2CB9, 0xFFFF, 0x2CBB, 0xFFFF, 0x2CBD, 0xFFFF, 0x2CBF, 0xFFFF}; +static unsigned short unicode_lower_data57[] = { + 0x2CC1, 0xFFFF, 0x2CC3, 0xFFFF, 0x2CC5, 0xFFFF, 0x2CC7, 0xFFFF, 0x2CC9, 0xFFFF, 0x2CCB, + 0xFFFF, 0x2CCD, 0xFFFF, 0x2CCF, 0xFFFF, 0x2CD1, 0xFFFF, 0x2CD3, 0xFFFF, 0x2CD5, 0xFFFF, + 0x2CD7, 0xFFFF, 0x2CD9, 0xFFFF, 0x2CDB, 0xFFFF, 0x2CDD, 0xFFFF, 0x2CDF, 0xFFFF}; +static unsigned short unicode_lower_data58[] = { + 0x2CE1, 0xFFFF, 0x2CE3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data59[] = { + 0xA641, 0xFFFF, 0xA643, 0xFFFF, 0xA645, 0xFFFF, 0xA647, 0xFFFF, 0xA649, 0xFFFF, 0xA64B, + 0xFFFF, 0xA64D, 0xFFFF, 0xA64F, 0xFFFF, 0xA651, 0xFFFF, 0xA653, 0xFFFF, 0xA655, 0xFFFF, + 0xA657, 0xFFFF, 0xA659, 0xFFFF, 0xA65B, 0xFFFF, 0xA65D, 0xFFFF, 0xA65F, 0xFFFF}; +static unsigned short unicode_lower_data60[] = { + 0xFFFF, 0xFFFF, 0xA663, 0xFFFF, 0xA665, 0xFFFF, 0xA667, 0xFFFF, 0xA669, 0xFFFF, 0xA66B, + 0xFFFF, 0xA66D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data61[] = { + 0xA681, 0xFFFF, 0xA683, 0xFFFF, 0xA685, 0xFFFF, 0xA687, 0xFFFF, 0xA689, 0xFFFF, 0xA68B, + 0xFFFF, 0xA68D, 0xFFFF, 0xA68F, 0xFFFF, 0xA691, 0xFFFF, 0xA693, 0xFFFF, 0xA695, 0xFFFF, + 0xA697, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data62[] = { + 0xFFFF, 0xFFFF, 0xA723, 0xFFFF, 0xA725, 0xFFFF, 0xA727, 0xFFFF, 0xA729, 0xFFFF, 0xA72B, + 0xFFFF, 0xA72D, 0xFFFF, 0xA72F, 0xFFFF, 0xFFFF, 0xFFFF, 0xA733, 0xFFFF, 0xA735, 0xFFFF, + 0xA737, 0xFFFF, 0xA739, 0xFFFF, 0xA73B, 0xFFFF, 0xA73D, 0xFFFF, 0xA73F, 0xFFFF}; +static unsigned short 
unicode_lower_data63[] = { + 0xA741, 0xFFFF, 0xA743, 0xFFFF, 0xA745, 0xFFFF, 0xA747, 0xFFFF, 0xA749, 0xFFFF, 0xA74B, + 0xFFFF, 0xA74D, 0xFFFF, 0xA74F, 0xFFFF, 0xA751, 0xFFFF, 0xA753, 0xFFFF, 0xA755, 0xFFFF, + 0xA757, 0xFFFF, 0xA759, 0xFFFF, 0xA75B, 0xFFFF, 0xA75D, 0xFFFF, 0xA75F, 0xFFFF}; +static unsigned short unicode_lower_data64[] = { + 0xA761, 0xFFFF, 0xA763, 0xFFFF, 0xA765, 0xFFFF, 0xA767, 0xFFFF, 0xA769, 0xFFFF, 0xA76B, + 0xFFFF, 0xA76D, 0xFFFF, 0xA76F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xA77A, 0xFFFF, 0xA77C, 0xFFFF, 0x1D79, 0xA77F, 0xFFFF}; +static unsigned short unicode_lower_data65[] = { + 0xA781, 0xFFFF, 0xA783, 0xFFFF, 0xA785, 0xFFFF, 0xA787, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xA78C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_lower_data66[] = { + 0xFFFF, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, + 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, + 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + +static unsigned short* unicode_lower_data_table[UNICODE_LOWER_BLOCK_COUNT] = { + unicode_lower_data0, unicode_lower_data1, unicode_lower_data2, unicode_lower_data3, + unicode_lower_data4, unicode_lower_data5, unicode_lower_data6, unicode_lower_data7, + unicode_lower_data8, unicode_lower_data9, unicode_lower_data10, unicode_lower_data11, + unicode_lower_data12, unicode_lower_data13, unicode_lower_data14, unicode_lower_data15, + unicode_lower_data16, unicode_lower_data17, unicode_lower_data18, unicode_lower_data19, + unicode_lower_data20, unicode_lower_data21, unicode_lower_data22, unicode_lower_data23, + unicode_lower_data24, unicode_lower_data25, unicode_lower_data26, unicode_lower_data27, + unicode_lower_data28, unicode_lower_data29, unicode_lower_data30, 
unicode_lower_data31, + unicode_lower_data32, unicode_lower_data33, unicode_lower_data34, unicode_lower_data35, + unicode_lower_data36, unicode_lower_data37, unicode_lower_data38, unicode_lower_data39, + unicode_lower_data40, unicode_lower_data41, unicode_lower_data42, unicode_lower_data43, + unicode_lower_data44, unicode_lower_data45, unicode_lower_data46, unicode_lower_data47, + unicode_lower_data48, unicode_lower_data49, unicode_lower_data50, unicode_lower_data51, + unicode_lower_data52, unicode_lower_data53, unicode_lower_data54, unicode_lower_data55, + unicode_lower_data56, unicode_lower_data57, unicode_lower_data58, unicode_lower_data59, + unicode_lower_data60, unicode_lower_data61, unicode_lower_data62, unicode_lower_data63, + unicode_lower_data64, unicode_lower_data65, unicode_lower_data66}; +/* Generated by builder. Do not modify. End unicode_lower_tables */ + +SQLITE_EXPORT u16 sqlite3_unicode_lower(u16 c) { + u16 index = unicode_lower_indexes[(c) >> UNICODE_LOWER_BLOCK_SHIFT]; + u8 position = (c)&UNICODE_LOWER_BLOCK_MASK; + u16(p) = (unicode_lower_data_table[index][unicode_lower_positions[index][position]]); + int l = unicode_lower_positions[index][position + 1] - unicode_lower_positions[index][position]; + + return ((l == 1) && ((p) == 0xFFFF)) ? c : p; +} +#endif + +#ifdef SQLITE3_UNICODE_UPPER +/* Generated by builder. Do not modify. Start unicode_upper_defines */ +#define UNICODE_UPPER_BLOCK_SHIFT 6 +#define UNICODE_UPPER_BLOCK_MASK ((1 << UNICODE_UPPER_BLOCK_SHIFT) - 1) +#define UNICODE_UPPER_BLOCK_SIZE (1 << UNICODE_UPPER_BLOCK_SHIFT) +#define UNICODE_UPPER_BLOCK_COUNT 44 +#define UNICODE_UPPER_INDEXES_SIZE (0x10000 >> UNICODE_UPPER_BLOCK_SHIFT) +/* Generated by builder. Do not modify. End unicode_upper_defines */ + +/* Generated by builder. Do not modify. 
Start unicode_upper_tables */ + +static unsigned short unicode_upper_indexes[UNICODE_UPPER_INDEXES_SIZE] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 22, 23, 24, 25, 26, 27, 28, 29, 0, 0, 0, 0, 0, 30, 31, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 34, 35, 36, 37, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 39, 0, 40, 41, 42, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0}; + +static unsigned char unicode_upper_positions[UNICODE_UPPER_BLOCK_COUNT][UNICODE_UPPER_BLOCK_SIZE + + 1] = { + /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 1 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 2 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 3 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 4 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 5 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 6 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 7 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 8 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 9 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 10 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 11 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 12 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 13 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 14 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 15 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 16 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 17 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 18 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 19 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 20 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 21 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 22 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 23 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 24 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 25 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 26 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 27 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 28 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 29 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 30 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 32 
*/ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 33 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 34 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 35 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 36 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 37 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 38 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 39 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 40 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 41 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 42 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 43 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}}; + +static unsigned short unicode_upper_data0[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data1[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, + 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, + 0x0057, 0x0058, 0x0059, 0x005A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data2[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x039C, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data3[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x00C0, + 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, + 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, + 0xFFFF, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x0178}; +static unsigned short unicode_upper_data4[] = { + 0xFFFF, 0x0100, 0xFFFF, 0x0102, 0xFFFF, 0x0104, 0xFFFF, 0x0106, 0xFFFF, 0x0108, 0xFFFF, + 0x010A, 0xFFFF, 0x010C, 0xFFFF, 0x010E, 0xFFFF, 0x0110, 0xFFFF, 0x0112, 0xFFFF, 0x0114, + 0xFFFF, 0x0116, 0xFFFF, 0x0118, 0xFFFF, 0x011A, 0xFFFF, 0x011C, 0xFFFF, 0x011E, 0xFFFF, + 0x0120, 0xFFFF, 0x0122, 0xFFFF, 0x0124, 0xFFFF, 0x0126, 0xFFFF, 0x0128, 0xFFFF, 0x012A, + 0xFFFF, 0x012C, 0xFFFF, 0x012E, 0xFFFF, 0x0049, 0xFFFF, 0x0132, 
0xFFFF, 0x0134, 0xFFFF, + 0x0136, 0xFFFF, 0xFFFF, 0x0139, 0xFFFF, 0x013B, 0xFFFF, 0x013D, 0xFFFF}; +static unsigned short unicode_upper_data5[] = { + 0x013F, 0xFFFF, 0x0141, 0xFFFF, 0x0143, 0xFFFF, 0x0145, 0xFFFF, 0x0147, 0xFFFF, 0xFFFF, + 0x014A, 0xFFFF, 0x014C, 0xFFFF, 0x014E, 0xFFFF, 0x0150, 0xFFFF, 0x0152, 0xFFFF, 0x0154, + 0xFFFF, 0x0156, 0xFFFF, 0x0158, 0xFFFF, 0x015A, 0xFFFF, 0x015C, 0xFFFF, 0x015E, 0xFFFF, + 0x0160, 0xFFFF, 0x0162, 0xFFFF, 0x0164, 0xFFFF, 0x0166, 0xFFFF, 0x0168, 0xFFFF, 0x016A, + 0xFFFF, 0x016C, 0xFFFF, 0x016E, 0xFFFF, 0x0170, 0xFFFF, 0x0172, 0xFFFF, 0x0174, 0xFFFF, + 0x0176, 0xFFFF, 0xFFFF, 0x0179, 0xFFFF, 0x017B, 0xFFFF, 0x017D, 0x0053}; +static unsigned short unicode_upper_data6[] = { + 0x0243, 0xFFFF, 0xFFFF, 0x0182, 0xFFFF, 0x0184, 0xFFFF, 0xFFFF, 0x0187, 0xFFFF, 0xFFFF, + 0xFFFF, 0x018B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0191, 0xFFFF, 0xFFFF, 0x01F6, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0198, 0x023D, 0xFFFF, 0xFFFF, 0xFFFF, 0x0220, 0xFFFF, 0xFFFF, + 0x01A0, 0xFFFF, 0x01A2, 0xFFFF, 0x01A4, 0xFFFF, 0xFFFF, 0x01A7, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x01AC, 0xFFFF, 0xFFFF, 0x01AF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01B3, 0xFFFF, 0x01B5, + 0xFFFF, 0xFFFF, 0x01B8, 0xFFFF, 0xFFFF, 0xFFFF, 0x01BC, 0xFFFF, 0x01F7}; +static unsigned short unicode_upper_data7[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01C4, 0x01C4, 0xFFFF, 0x01C7, 0x01C7, 0xFFFF, + 0x01CA, 0x01CA, 0xFFFF, 0x01CD, 0xFFFF, 0x01CF, 0xFFFF, 0x01D1, 0xFFFF, 0x01D3, 0xFFFF, + 0x01D5, 0xFFFF, 0x01D7, 0xFFFF, 0x01D9, 0xFFFF, 0x01DB, 0x018E, 0xFFFF, 0x01DE, 0xFFFF, + 0x01E0, 0xFFFF, 0x01E2, 0xFFFF, 0x01E4, 0xFFFF, 0x01E6, 0xFFFF, 0x01E8, 0xFFFF, 0x01EA, + 0xFFFF, 0x01EC, 0xFFFF, 0x01EE, 0xFFFF, 0xFFFF, 0x01F1, 0x01F1, 0xFFFF, 0x01F4, 0xFFFF, + 0xFFFF, 0xFFFF, 0x01F8, 0xFFFF, 0x01FA, 0xFFFF, 0x01FC, 0xFFFF, 0x01FE}; +static unsigned short unicode_upper_data8[] = { + 0xFFFF, 0x0200, 0xFFFF, 0x0202, 0xFFFF, 0x0204, 0xFFFF, 0x0206, 0xFFFF, 0x0208, 0xFFFF, + 0x020A, 0xFFFF, 0x020C, 0xFFFF, 
0x020E, 0xFFFF, 0x0210, 0xFFFF, 0x0212, 0xFFFF, 0x0214, + 0xFFFF, 0x0216, 0xFFFF, 0x0218, 0xFFFF, 0x021A, 0xFFFF, 0x021C, 0xFFFF, 0x021E, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0222, 0xFFFF, 0x0224, 0xFFFF, 0x0226, 0xFFFF, 0x0228, 0xFFFF, 0x022A, + 0xFFFF, 0x022C, 0xFFFF, 0x022E, 0xFFFF, 0x0230, 0xFFFF, 0x0232, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x023B, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data9[] = { + 0xFFFF, 0xFFFF, 0x0241, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0246, 0xFFFF, 0x0248, 0xFFFF, + 0x024A, 0xFFFF, 0x024C, 0xFFFF, 0x024E, 0x2C6F, 0x2C6D, 0xFFFF, 0x0181, 0x0186, 0xFFFF, + 0x0189, 0x018A, 0xFFFF, 0x018F, 0xFFFF, 0x0190, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0193, + 0xFFFF, 0xFFFF, 0x0194, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0197, 0x0196, 0xFFFF, 0x2C62, + 0xFFFF, 0xFFFF, 0xFFFF, 0x019C, 0xFFFF, 0x2C6E, 0x019D, 0xFFFF, 0xFFFF, 0x019F, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C64, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data10[] = { + 0x01A6, 0xFFFF, 0xFFFF, 0x01A9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01AE, 0x0244, 0x01B1, + 0x01B2, 0x0245, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01B7, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data11[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0399, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0x0370, 0xFFFF, 0x0372, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0376, 0xFFFF, 0xFFFF, 0xFFFF, 0x03FD, 0x03FE, 0x03FF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data12[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0386, 0x0388, 0x0389, 0x038A, 0xFFFF, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, + 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F}; +static unsigned short unicode_upper_data13[] = { + 0x03A0, 0x03A1, 0x03A3, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, + 0x03AB, 0x038C, 0x038E, 0x038F, 0xFFFF, 0x0392, 0x0398, 0xFFFF, 0xFFFF, 0xFFFF, 0x03A6, + 0x03A0, 0x03CF, 0xFFFF, 0x03D8, 0xFFFF, 0x03DA, 0xFFFF, 0x03DC, 0xFFFF, 0x03DE, 0xFFFF, + 0x03E0, 0xFFFF, 0x03E2, 0xFFFF, 0x03E4, 0xFFFF, 0x03E6, 0xFFFF, 0x03E8, 0xFFFF, 0x03EA, + 0xFFFF, 0x03EC, 0xFFFF, 0x03EE, 0x039A, 0x03A1, 0x03F9, 0xFFFF, 0xFFFF, 0x0395, 0xFFFF, + 0xFFFF, 0x03F7, 0xFFFF, 0xFFFF, 0x03FA, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data14[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, + 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F}; +static unsigned short unicode_upper_data15[] = { + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, + 
0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, + 0x0406, 0x0407, 0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x040D, 0x040E, 0x040F, 0xFFFF, + 0x0460, 0xFFFF, 0x0462, 0xFFFF, 0x0464, 0xFFFF, 0x0466, 0xFFFF, 0x0468, 0xFFFF, 0x046A, + 0xFFFF, 0x046C, 0xFFFF, 0x046E, 0xFFFF, 0x0470, 0xFFFF, 0x0472, 0xFFFF, 0x0474, 0xFFFF, + 0x0476, 0xFFFF, 0x0478, 0xFFFF, 0x047A, 0xFFFF, 0x047C, 0xFFFF, 0x047E}; +static unsigned short unicode_upper_data16[] = { + 0xFFFF, 0x0480, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x048A, 0xFFFF, 0x048C, 0xFFFF, 0x048E, 0xFFFF, 0x0490, 0xFFFF, 0x0492, 0xFFFF, 0x0494, + 0xFFFF, 0x0496, 0xFFFF, 0x0498, 0xFFFF, 0x049A, 0xFFFF, 0x049C, 0xFFFF, 0x049E, 0xFFFF, + 0x04A0, 0xFFFF, 0x04A2, 0xFFFF, 0x04A4, 0xFFFF, 0x04A6, 0xFFFF, 0x04A8, 0xFFFF, 0x04AA, + 0xFFFF, 0x04AC, 0xFFFF, 0x04AE, 0xFFFF, 0x04B0, 0xFFFF, 0x04B2, 0xFFFF, 0x04B4, 0xFFFF, + 0x04B6, 0xFFFF, 0x04B8, 0xFFFF, 0x04BA, 0xFFFF, 0x04BC, 0xFFFF, 0x04BE}; +static unsigned short unicode_upper_data17[] = { + 0xFFFF, 0xFFFF, 0x04C1, 0xFFFF, 0x04C3, 0xFFFF, 0x04C5, 0xFFFF, 0x04C7, 0xFFFF, 0x04C9, + 0xFFFF, 0x04CB, 0xFFFF, 0x04CD, 0x04C0, 0xFFFF, 0x04D0, 0xFFFF, 0x04D2, 0xFFFF, 0x04D4, + 0xFFFF, 0x04D6, 0xFFFF, 0x04D8, 0xFFFF, 0x04DA, 0xFFFF, 0x04DC, 0xFFFF, 0x04DE, 0xFFFF, + 0x04E0, 0xFFFF, 0x04E2, 0xFFFF, 0x04E4, 0xFFFF, 0x04E6, 0xFFFF, 0x04E8, 0xFFFF, 0x04EA, + 0xFFFF, 0x04EC, 0xFFFF, 0x04EE, 0xFFFF, 0x04F0, 0xFFFF, 0x04F2, 0xFFFF, 0x04F4, 0xFFFF, + 0x04F6, 0xFFFF, 0x04F8, 0xFFFF, 0x04FA, 0xFFFF, 0x04FC, 0xFFFF, 0x04FE}; +static unsigned short unicode_upper_data18[] = { + 0xFFFF, 0x0500, 0xFFFF, 0x0502, 0xFFFF, 0x0504, 0xFFFF, 0x0506, 0xFFFF, 0x0508, 0xFFFF, + 0x050A, 0xFFFF, 0x050C, 0xFFFF, 0x050E, 0xFFFF, 0x0510, 0xFFFF, 0x0512, 0xFFFF, 0x0514, + 0xFFFF, 0x0516, 0xFFFF, 0x0518, 0xFFFF, 0x051A, 0xFFFF, 0x051C, 0xFFFF, 0x051E, 0xFFFF, + 0x0520, 0xFFFF, 0x0522, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data19[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053A, 0x053B, + 0x053C, 0x053D, 0x053E, 0x053F, 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, + 0x0547, 0x0548, 0x0549, 0x054A, 0x054B, 0x054C, 0x054D, 0x054E, 0x054F}; +static unsigned short unicode_upper_data20[] = { + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data21[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA77D, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C63, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data22[] = { + 0xFFFF, 0x1E00, 0xFFFF, 0x1E02, 0xFFFF, 0x1E04, 0xFFFF, 
0x1E06, 0xFFFF, 0x1E08, 0xFFFF, + 0x1E0A, 0xFFFF, 0x1E0C, 0xFFFF, 0x1E0E, 0xFFFF, 0x1E10, 0xFFFF, 0x1E12, 0xFFFF, 0x1E14, + 0xFFFF, 0x1E16, 0xFFFF, 0x1E18, 0xFFFF, 0x1E1A, 0xFFFF, 0x1E1C, 0xFFFF, 0x1E1E, 0xFFFF, + 0x1E20, 0xFFFF, 0x1E22, 0xFFFF, 0x1E24, 0xFFFF, 0x1E26, 0xFFFF, 0x1E28, 0xFFFF, 0x1E2A, + 0xFFFF, 0x1E2C, 0xFFFF, 0x1E2E, 0xFFFF, 0x1E30, 0xFFFF, 0x1E32, 0xFFFF, 0x1E34, 0xFFFF, + 0x1E36, 0xFFFF, 0x1E38, 0xFFFF, 0x1E3A, 0xFFFF, 0x1E3C, 0xFFFF, 0x1E3E}; +static unsigned short unicode_upper_data23[] = { + 0xFFFF, 0x1E40, 0xFFFF, 0x1E42, 0xFFFF, 0x1E44, 0xFFFF, 0x1E46, 0xFFFF, 0x1E48, 0xFFFF, + 0x1E4A, 0xFFFF, 0x1E4C, 0xFFFF, 0x1E4E, 0xFFFF, 0x1E50, 0xFFFF, 0x1E52, 0xFFFF, 0x1E54, + 0xFFFF, 0x1E56, 0xFFFF, 0x1E58, 0xFFFF, 0x1E5A, 0xFFFF, 0x1E5C, 0xFFFF, 0x1E5E, 0xFFFF, + 0x1E60, 0xFFFF, 0x1E62, 0xFFFF, 0x1E64, 0xFFFF, 0x1E66, 0xFFFF, 0x1E68, 0xFFFF, 0x1E6A, + 0xFFFF, 0x1E6C, 0xFFFF, 0x1E6E, 0xFFFF, 0x1E70, 0xFFFF, 0x1E72, 0xFFFF, 0x1E74, 0xFFFF, + 0x1E76, 0xFFFF, 0x1E78, 0xFFFF, 0x1E7A, 0xFFFF, 0x1E7C, 0xFFFF, 0x1E7E}; +static unsigned short unicode_upper_data24[] = { + 0xFFFF, 0x1E80, 0xFFFF, 0x1E82, 0xFFFF, 0x1E84, 0xFFFF, 0x1E86, 0xFFFF, 0x1E88, 0xFFFF, + 0x1E8A, 0xFFFF, 0x1E8C, 0xFFFF, 0x1E8E, 0xFFFF, 0x1E90, 0xFFFF, 0x1E92, 0xFFFF, 0x1E94, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1E60, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x1EA0, 0xFFFF, 0x1EA2, 0xFFFF, 0x1EA4, 0xFFFF, 0x1EA6, 0xFFFF, 0x1EA8, 0xFFFF, 0x1EAA, + 0xFFFF, 0x1EAC, 0xFFFF, 0x1EAE, 0xFFFF, 0x1EB0, 0xFFFF, 0x1EB2, 0xFFFF, 0x1EB4, 0xFFFF, + 0x1EB6, 0xFFFF, 0x1EB8, 0xFFFF, 0x1EBA, 0xFFFF, 0x1EBC, 0xFFFF, 0x1EBE}; +static unsigned short unicode_upper_data25[] = { + 0xFFFF, 0x1EC0, 0xFFFF, 0x1EC2, 0xFFFF, 0x1EC4, 0xFFFF, 0x1EC6, 0xFFFF, 0x1EC8, 0xFFFF, + 0x1ECA, 0xFFFF, 0x1ECC, 0xFFFF, 0x1ECE, 0xFFFF, 0x1ED0, 0xFFFF, 0x1ED2, 0xFFFF, 0x1ED4, + 0xFFFF, 0x1ED6, 0xFFFF, 0x1ED8, 0xFFFF, 0x1EDA, 0xFFFF, 0x1EDC, 0xFFFF, 0x1EDE, 0xFFFF, + 0x1EE0, 0xFFFF, 0x1EE2, 0xFFFF, 0x1EE4, 0xFFFF, 0x1EE6, 
0xFFFF, 0x1EE8, 0xFFFF, 0x1EEA, + 0xFFFF, 0x1EEC, 0xFFFF, 0x1EEE, 0xFFFF, 0x1EF0, 0xFFFF, 0x1EF2, 0xFFFF, 0x1EF4, 0xFFFF, + 0x1EF6, 0xFFFF, 0x1EF8, 0xFFFF, 0x1EFA, 0xFFFF, 0x1EFC, 0xFFFF, 0x1EFE}; +static unsigned short unicode_upper_data26[] = { + 0x1F08, 0x1F09, 0x1F0A, 0x1F0B, 0x1F0C, 0x1F0D, 0x1F0E, 0x1F0F, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F18, 0x1F19, 0x1F1A, 0x1F1B, 0x1F1C, 0x1F1D, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F28, + 0x1F29, 0x1F2A, 0x1F2B, 0x1F2C, 0x1F2D, 0x1F2E, 0x1F2F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F38, 0x1F39, 0x1F3A, 0x1F3B, 0x1F3C, 0x1F3D, 0x1F3E, + 0x1F3F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data27[] = { + 0x1F48, 0x1F49, 0x1F4A, 0x1F4B, 0x1F4C, 0x1F4D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F59, 0xFFFF, 0x1F5B, 0xFFFF, 0x1F5D, + 0xFFFF, 0x1F5F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F68, + 0x1F69, 0x1F6A, 0x1F6B, 0x1F6C, 0x1F6D, 0x1F6E, 0x1F6F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FBA, 0x1FBB, 0x1FC8, 0x1FC9, 0x1FCA, 0x1FCB, 0x1FDA, + 0x1FDB, 0x1FF8, 0x1FF9, 0x1FEA, 0x1FEB, 0x1FFA, 0x1FFB, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data28[] = { + 0x1F88, 0x1F89, 0x1F8A, 0x1F8B, 0x1F8C, 0x1F8D, 0x1F8E, 0x1F8F, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F98, 0x1F99, 0x1F9A, 0x1F9B, 0x1F9C, 0x1F9D, + 0x1F9E, 0x1F9F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FA8, + 0x1FA9, 0x1FAA, 0x1FAB, 0x1FAC, 0x1FAD, 0x1FAE, 0x1FAF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FB8, 0x1FB9, 0xFFFF, 0x1FBC, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0399, 0xFFFF}; +static unsigned short unicode_upper_data29[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 
0x1FCC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FD8, 0x1FD9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FE8, + 0x1FE9, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FEC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FFC, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data30[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x2132, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, + 0x2167, 0x2168, 0x2169, 0x216A, 0x216B, 0x216C, 0x216D, 0x216E, 0x216F}; +static unsigned short unicode_upper_data31[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2183, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data32[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x24B6, 0x24B7, 0x24B8, 0x24B9, 0x24BA, 0x24BB, + 0x24BC, 0x24BD, 0x24BE, 0x24BF, 0x24C0, 0x24C1, 0x24C2, 0x24C3, 0x24C4, 0x24C5, 0x24C6, + 0x24C7, 0x24C8, 0x24C9, 
0x24CA, 0x24CB, 0x24CC, 0x24CD, 0x24CE, 0x24CF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data33[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C00, 0x2C01, 0x2C02, 0x2C03, 0x2C04, 0x2C05, 0x2C06, + 0x2C07, 0x2C08, 0x2C09, 0x2C0A, 0x2C0B, 0x2C0C, 0x2C0D, 0x2C0E, 0x2C0F}; +static unsigned short unicode_upper_data34[] = { + 0x2C10, 0x2C11, 0x2C12, 0x2C13, 0x2C14, 0x2C15, 0x2C16, 0x2C17, 0x2C18, 0x2C19, 0x2C1A, + 0x2C1B, 0x2C1C, 0x2C1D, 0x2C1E, 0x2C1F, 0x2C20, 0x2C21, 0x2C22, 0x2C23, 0x2C24, 0x2C25, + 0x2C26, 0x2C27, 0x2C28, 0x2C29, 0x2C2A, 0x2C2B, 0x2C2C, 0x2C2D, 0x2C2E, 0xFFFF, 0xFFFF, + 0x2C60, 0xFFFF, 0xFFFF, 0xFFFF, 0x023A, 0x023E, 0xFFFF, 0x2C67, 0xFFFF, 0x2C69, 0xFFFF, + 0x2C6B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C72, 0xFFFF, 0xFFFF, 0x2C75, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data35[] = { + 0xFFFF, 0x2C80, 0xFFFF, 0x2C82, 0xFFFF, 0x2C84, 0xFFFF, 0x2C86, 0xFFFF, 0x2C88, 0xFFFF, + 0x2C8A, 0xFFFF, 0x2C8C, 0xFFFF, 0x2C8E, 0xFFFF, 0x2C90, 0xFFFF, 0x2C92, 0xFFFF, 0x2C94, + 0xFFFF, 0x2C96, 0xFFFF, 0x2C98, 0xFFFF, 0x2C9A, 0xFFFF, 0x2C9C, 0xFFFF, 0x2C9E, 0xFFFF, + 0x2CA0, 0xFFFF, 0x2CA2, 0xFFFF, 0x2CA4, 0xFFFF, 0x2CA6, 0xFFFF, 0x2CA8, 0xFFFF, 0x2CAA, + 0xFFFF, 0x2CAC, 0xFFFF, 0x2CAE, 0xFFFF, 0x2CB0, 0xFFFF, 0x2CB2, 0xFFFF, 0x2CB4, 0xFFFF, + 0x2CB6, 0xFFFF, 0x2CB8, 0xFFFF, 0x2CBA, 0xFFFF, 0x2CBC, 0xFFFF, 0x2CBE}; +static unsigned short unicode_upper_data36[] 
= { + 0xFFFF, 0x2CC0, 0xFFFF, 0x2CC2, 0xFFFF, 0x2CC4, 0xFFFF, 0x2CC6, 0xFFFF, 0x2CC8, 0xFFFF, + 0x2CCA, 0xFFFF, 0x2CCC, 0xFFFF, 0x2CCE, 0xFFFF, 0x2CD0, 0xFFFF, 0x2CD2, 0xFFFF, 0x2CD4, + 0xFFFF, 0x2CD6, 0xFFFF, 0x2CD8, 0xFFFF, 0x2CDA, 0xFFFF, 0x2CDC, 0xFFFF, 0x2CDE, 0xFFFF, + 0x2CE0, 0xFFFF, 0x2CE2, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data37[] = { + 0x10A0, 0x10A1, 0x10A2, 0x10A3, 0x10A4, 0x10A5, 0x10A6, 0x10A7, 0x10A8, 0x10A9, 0x10AA, + 0x10AB, 0x10AC, 0x10AD, 0x10AE, 0x10AF, 0x10B0, 0x10B1, 0x10B2, 0x10B3, 0x10B4, 0x10B5, + 0x10B6, 0x10B7, 0x10B8, 0x10B9, 0x10BA, 0x10BB, 0x10BC, 0x10BD, 0x10BE, 0x10BF, 0x10C0, + 0x10C1, 0x10C2, 0x10C3, 0x10C4, 0x10C5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data38[] = { + 0xFFFF, 0xA640, 0xFFFF, 0xA642, 0xFFFF, 0xA644, 0xFFFF, 0xA646, 0xFFFF, 0xA648, 0xFFFF, + 0xA64A, 0xFFFF, 0xA64C, 0xFFFF, 0xA64E, 0xFFFF, 0xA650, 0xFFFF, 0xA652, 0xFFFF, 0xA654, + 0xFFFF, 0xA656, 0xFFFF, 0xA658, 0xFFFF, 0xA65A, 0xFFFF, 0xA65C, 0xFFFF, 0xA65E, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA662, 0xFFFF, 0xA664, 0xFFFF, 0xA666, 0xFFFF, 0xA668, 0xFFFF, 0xA66A, + 0xFFFF, 0xA66C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data39[] = { + 0xFFFF, 0xA680, 0xFFFF, 0xA682, 0xFFFF, 0xA684, 0xFFFF, 0xA686, 0xFFFF, 0xA688, 0xFFFF, + 0xA68A, 0xFFFF, 0xA68C, 0xFFFF, 0xA68E, 0xFFFF, 0xA690, 0xFFFF, 0xA692, 0xFFFF, 0xA694, + 0xFFFF, 0xA696, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_upper_data40[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA722, 0xFFFF, 0xA724, 0xFFFF, 0xA726, 0xFFFF, 0xA728, 0xFFFF, 0xA72A, + 0xFFFF, 0xA72C, 0xFFFF, 0xA72E, 0xFFFF, 0xFFFF, 0xFFFF, 0xA732, 0xFFFF, 0xA734, 0xFFFF, + 0xA736, 0xFFFF, 0xA738, 0xFFFF, 0xA73A, 0xFFFF, 0xA73C, 0xFFFF, 0xA73E}; +static unsigned short unicode_upper_data41[] = { + 0xFFFF, 0xA740, 0xFFFF, 0xA742, 0xFFFF, 0xA744, 0xFFFF, 0xA746, 0xFFFF, 0xA748, 0xFFFF, + 0xA74A, 0xFFFF, 0xA74C, 0xFFFF, 0xA74E, 0xFFFF, 0xA750, 0xFFFF, 0xA752, 0xFFFF, 0xA754, + 0xFFFF, 0xA756, 0xFFFF, 0xA758, 0xFFFF, 0xA75A, 0xFFFF, 0xA75C, 0xFFFF, 0xA75E, 0xFFFF, + 0xA760, 0xFFFF, 0xA762, 0xFFFF, 0xA764, 0xFFFF, 0xA766, 0xFFFF, 0xA768, 0xFFFF, 0xA76A, + 0xFFFF, 0xA76C, 0xFFFF, 0xA76E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xA779, 0xFFFF, 0xA77B, 0xFFFF, 0xFFFF, 0xA77E}; +static unsigned short unicode_upper_data42[] = { + 0xFFFF, 0xA780, 0xFFFF, 0xA782, 0xFFFF, 0xA784, 0xFFFF, 0xA786, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xA78B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned 
short unicode_upper_data43[] = { + 0xFFFF, 0xFF21, 0xFF22, 0xFF23, 0xFF24, 0xFF25, 0xFF26, 0xFF27, 0xFF28, 0xFF29, 0xFF2A, + 0xFF2B, 0xFF2C, 0xFF2D, 0xFF2E, 0xFF2F, 0xFF30, 0xFF31, 0xFF32, 0xFF33, 0xFF34, 0xFF35, + 0xFF36, 0xFF37, 0xFF38, 0xFF39, 0xFF3A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + +static unsigned short* unicode_upper_data_table[UNICODE_UPPER_BLOCK_COUNT] = { + unicode_upper_data0, unicode_upper_data1, unicode_upper_data2, unicode_upper_data3, + unicode_upper_data4, unicode_upper_data5, unicode_upper_data6, unicode_upper_data7, + unicode_upper_data8, unicode_upper_data9, unicode_upper_data10, unicode_upper_data11, + unicode_upper_data12, unicode_upper_data13, unicode_upper_data14, unicode_upper_data15, + unicode_upper_data16, unicode_upper_data17, unicode_upper_data18, unicode_upper_data19, + unicode_upper_data20, unicode_upper_data21, unicode_upper_data22, unicode_upper_data23, + unicode_upper_data24, unicode_upper_data25, unicode_upper_data26, unicode_upper_data27, + unicode_upper_data28, unicode_upper_data29, unicode_upper_data30, unicode_upper_data31, + unicode_upper_data32, unicode_upper_data33, unicode_upper_data34, unicode_upper_data35, + unicode_upper_data36, unicode_upper_data37, unicode_upper_data38, unicode_upper_data39, + unicode_upper_data40, unicode_upper_data41, unicode_upper_data42, unicode_upper_data43}; +/* Generated by builder. Do not modify. 
End unicode_upper_tables */ + +SQLITE_EXPORT u16 sqlite3_unicode_upper(u16 c) { + u16 index = unicode_upper_indexes[(c) >> UNICODE_UPPER_BLOCK_SHIFT]; + u8 position = (c)&UNICODE_UPPER_BLOCK_MASK; + u16(p) = (unicode_upper_data_table[index][unicode_upper_positions[index][position]]); + int l = unicode_upper_positions[index][position + 1] - unicode_upper_positions[index][position]; + + return ((l == 1) && ((p) == 0xFFFF)) ? c : p; +} +#endif + +#ifdef SQLITE3_UNICODE_TITLE +/* Generated by builder. Do not modify. Start unicode_title_defines */ +#define UNICODE_TITLE_BLOCK_SHIFT 6 +#define UNICODE_TITLE_BLOCK_MASK ((1 << UNICODE_TITLE_BLOCK_SHIFT) - 1) +#define UNICODE_TITLE_BLOCK_SIZE (1 << UNICODE_TITLE_BLOCK_SHIFT) +#define UNICODE_TITLE_BLOCK_COUNT 44 +#define UNICODE_TITLE_INDEXES_SIZE (0x10000 >> UNICODE_TITLE_BLOCK_SHIFT) +/* Generated by builder. Do not modify. End unicode_title_defines */ + +/* Generated by builder. Do not modify. Start unicode_title_tables */ + +static unsigned short unicode_title_indexes[UNICODE_TITLE_INDEXES_SIZE] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 22, 23, 24, 25, 26, 27, 28, 29, 0, 0, 0, 0, 0, 30, 31, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 34, 35, 36, 37, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 39, 0, 40, 41, 42, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0}; + +static unsigned char unicode_title_positions[UNICODE_TITLE_BLOCK_COUNT][UNICODE_TITLE_BLOCK_SIZE + + 1] = { + /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 1 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 2 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 3 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 4 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 5 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 6 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 7 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 8 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 9 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 10 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 11 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 12 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 13 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 
42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 14 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 15 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 16 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 17 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 18 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 19 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 20 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 21 */ {0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 22 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 23 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 24 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 25 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 26 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 27 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 28 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 
33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 29 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 30 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 32 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 33 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 34 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 35 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 
61, 62, 63, 64}, + /* 36 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 37 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 38 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 39 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 40 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 41 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 42 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}, + /* 43 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}}; + +static unsigned short unicode_title_data0[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data1[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, + 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, + 0x0057, 0x0058, 0x0059, 0x005A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data2[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x039C, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data3[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x00C0, + 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, + 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, + 0xFFFF, 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x0178}; +static unsigned short unicode_title_data4[] = { + 0xFFFF, 0x0100, 0xFFFF, 0x0102, 0xFFFF, 0x0104, 0xFFFF, 0x0106, 0xFFFF, 0x0108, 0xFFFF, + 0x010A, 0xFFFF, 0x010C, 0xFFFF, 0x010E, 0xFFFF, 0x0110, 0xFFFF, 0x0112, 0xFFFF, 0x0114, + 0xFFFF, 0x0116, 0xFFFF, 0x0118, 0xFFFF, 0x011A, 0xFFFF, 0x011C, 0xFFFF, 0x011E, 0xFFFF, + 0x0120, 0xFFFF, 0x0122, 0xFFFF, 0x0124, 0xFFFF, 0x0126, 0xFFFF, 0x0128, 0xFFFF, 0x012A, + 0xFFFF, 0x012C, 0xFFFF, 0x012E, 0xFFFF, 0x0049, 0xFFFF, 0x0132, 0xFFFF, 0x0134, 0xFFFF, + 0x0136, 0xFFFF, 0xFFFF, 0x0139, 0xFFFF, 0x013B, 0xFFFF, 0x013D, 0xFFFF}; +static unsigned short unicode_title_data5[] = { + 0x013F, 0xFFFF, 0x0141, 0xFFFF, 0x0143, 0xFFFF, 0x0145, 0xFFFF, 0x0147, 0xFFFF, 0xFFFF, + 0x014A, 0xFFFF, 0x014C, 0xFFFF, 0x014E, 0xFFFF, 0x0150, 0xFFFF, 0x0152, 0xFFFF, 0x0154, + 0xFFFF, 0x0156, 0xFFFF, 0x0158, 0xFFFF, 0x015A, 0xFFFF, 0x015C, 0xFFFF, 0x015E, 0xFFFF, + 0x0160, 0xFFFF, 0x0162, 0xFFFF, 0x0164, 0xFFFF, 0x0166, 0xFFFF, 0x0168, 0xFFFF, 0x016A, + 0xFFFF, 0x016C, 0xFFFF, 0x016E, 0xFFFF, 0x0170, 0xFFFF, 0x0172, 0xFFFF, 0x0174, 0xFFFF, + 0x0176, 0xFFFF, 0xFFFF, 0x0179, 0xFFFF, 0x017B, 0xFFFF, 0x017D, 0x0053}; +static unsigned short unicode_title_data6[] = { + 0x0243, 0xFFFF, 0xFFFF, 0x0182, 0xFFFF, 0x0184, 0xFFFF, 0xFFFF, 0x0187, 0xFFFF, 0xFFFF, + 0xFFFF, 0x018B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0191, 0xFFFF, 0xFFFF, 0x01F6, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0198, 0x023D, 0xFFFF, 0xFFFF, 0xFFFF, 0x0220, 0xFFFF, 0xFFFF, + 0x01A0, 0xFFFF, 0x01A2, 0xFFFF, 0x01A4, 0xFFFF, 0xFFFF, 
0x01A7, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x01AC, 0xFFFF, 0xFFFF, 0x01AF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01B3, 0xFFFF, 0x01B5, + 0xFFFF, 0xFFFF, 0x01B8, 0xFFFF, 0xFFFF, 0xFFFF, 0x01BC, 0xFFFF, 0x01F7}; +static unsigned short unicode_title_data7[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01C5, 0x01C5, 0x01C5, 0x01C8, 0x01C8, 0x01C8, 0x01CB, + 0x01CB, 0x01CB, 0xFFFF, 0x01CD, 0xFFFF, 0x01CF, 0xFFFF, 0x01D1, 0xFFFF, 0x01D3, 0xFFFF, + 0x01D5, 0xFFFF, 0x01D7, 0xFFFF, 0x01D9, 0xFFFF, 0x01DB, 0x018E, 0xFFFF, 0x01DE, 0xFFFF, + 0x01E0, 0xFFFF, 0x01E2, 0xFFFF, 0x01E4, 0xFFFF, 0x01E6, 0xFFFF, 0x01E8, 0xFFFF, 0x01EA, + 0xFFFF, 0x01EC, 0xFFFF, 0x01EE, 0xFFFF, 0x01F2, 0x01F2, 0x01F2, 0xFFFF, 0x01F4, 0xFFFF, + 0xFFFF, 0xFFFF, 0x01F8, 0xFFFF, 0x01FA, 0xFFFF, 0x01FC, 0xFFFF, 0x01FE}; +static unsigned short unicode_title_data8[] = { + 0xFFFF, 0x0200, 0xFFFF, 0x0202, 0xFFFF, 0x0204, 0xFFFF, 0x0206, 0xFFFF, 0x0208, 0xFFFF, + 0x020A, 0xFFFF, 0x020C, 0xFFFF, 0x020E, 0xFFFF, 0x0210, 0xFFFF, 0x0212, 0xFFFF, 0x0214, + 0xFFFF, 0x0216, 0xFFFF, 0x0218, 0xFFFF, 0x021A, 0xFFFF, 0x021C, 0xFFFF, 0x021E, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0222, 0xFFFF, 0x0224, 0xFFFF, 0x0226, 0xFFFF, 0x0228, 0xFFFF, 0x022A, + 0xFFFF, 0x022C, 0xFFFF, 0x022E, 0xFFFF, 0x0230, 0xFFFF, 0x0232, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x023B, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data9[] = { + 0xFFFF, 0xFFFF, 0x0241, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0246, 0xFFFF, 0x0248, 0xFFFF, + 0x024A, 0xFFFF, 0x024C, 0xFFFF, 0x024E, 0x2C6F, 0x2C6D, 0xFFFF, 0x0181, 0x0186, 0xFFFF, + 0x0189, 0x018A, 0xFFFF, 0x018F, 0xFFFF, 0x0190, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0193, + 0xFFFF, 0xFFFF, 0x0194, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0197, 0x0196, 0xFFFF, 0x2C62, + 0xFFFF, 0xFFFF, 0xFFFF, 0x019C, 0xFFFF, 0x2C6E, 0x019D, 0xFFFF, 0xFFFF, 0x019F, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C64, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data10[] = { + 0x01A6, 0xFFFF, 0xFFFF, 
0x01A9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01AE, 0x0244, 0x01B1, + 0x01B2, 0x0245, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x01B7, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data11[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0399, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0370, 0xFFFF, 0x0372, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0376, 0xFFFF, 0xFFFF, 0xFFFF, 0x03FD, 0x03FE, 0x03FF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data12[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0386, 0x0388, 0x0389, 0x038A, 0xFFFF, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, + 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F}; +static unsigned short unicode_title_data13[] = { + 0x03A0, 0x03A1, 0x03A3, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, + 0x03AB, 0x038C, 0x038E, 0x038F, 0xFFFF, 0x0392, 0x0398, 0xFFFF, 0xFFFF, 0xFFFF, 0x03A6, + 0x03A0, 0x03CF, 0xFFFF, 0x03D8, 0xFFFF, 0x03DA, 0xFFFF, 0x03DC, 0xFFFF, 0x03DE, 0xFFFF, + 0x03E0, 0xFFFF, 0x03E2, 
0xFFFF, 0x03E4, 0xFFFF, 0x03E6, 0xFFFF, 0x03E8, 0xFFFF, 0x03EA, + 0xFFFF, 0x03EC, 0xFFFF, 0x03EE, 0x039A, 0x03A1, 0x03F9, 0xFFFF, 0xFFFF, 0x0395, 0xFFFF, + 0xFFFF, 0x03F7, 0xFFFF, 0xFFFF, 0x03FA, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data14[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, + 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F}; +static unsigned short unicode_title_data15[] = { + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, + 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, + 0x0406, 0x0407, 0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x040D, 0x040E, 0x040F, 0xFFFF, + 0x0460, 0xFFFF, 0x0462, 0xFFFF, 0x0464, 0xFFFF, 0x0466, 0xFFFF, 0x0468, 0xFFFF, 0x046A, + 0xFFFF, 0x046C, 0xFFFF, 0x046E, 0xFFFF, 0x0470, 0xFFFF, 0x0472, 0xFFFF, 0x0474, 0xFFFF, + 0x0476, 0xFFFF, 0x0478, 0xFFFF, 0x047A, 0xFFFF, 0x047C, 0xFFFF, 0x047E}; +static unsigned short unicode_title_data16[] = { + 0xFFFF, 0x0480, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x048A, 0xFFFF, 0x048C, 0xFFFF, 0x048E, 0xFFFF, 0x0490, 0xFFFF, 0x0492, 0xFFFF, 0x0494, + 0xFFFF, 0x0496, 0xFFFF, 0x0498, 0xFFFF, 0x049A, 0xFFFF, 0x049C, 0xFFFF, 0x049E, 0xFFFF, + 0x04A0, 0xFFFF, 0x04A2, 0xFFFF, 0x04A4, 0xFFFF, 0x04A6, 0xFFFF, 0x04A8, 0xFFFF, 0x04AA, + 0xFFFF, 0x04AC, 0xFFFF, 0x04AE, 0xFFFF, 0x04B0, 0xFFFF, 0x04B2, 0xFFFF, 0x04B4, 0xFFFF, + 0x04B6, 0xFFFF, 0x04B8, 0xFFFF, 0x04BA, 0xFFFF, 0x04BC, 0xFFFF, 0x04BE}; +static unsigned short unicode_title_data17[] 
= { + 0xFFFF, 0xFFFF, 0x04C1, 0xFFFF, 0x04C3, 0xFFFF, 0x04C5, 0xFFFF, 0x04C7, 0xFFFF, 0x04C9, + 0xFFFF, 0x04CB, 0xFFFF, 0x04CD, 0x04C0, 0xFFFF, 0x04D0, 0xFFFF, 0x04D2, 0xFFFF, 0x04D4, + 0xFFFF, 0x04D6, 0xFFFF, 0x04D8, 0xFFFF, 0x04DA, 0xFFFF, 0x04DC, 0xFFFF, 0x04DE, 0xFFFF, + 0x04E0, 0xFFFF, 0x04E2, 0xFFFF, 0x04E4, 0xFFFF, 0x04E6, 0xFFFF, 0x04E8, 0xFFFF, 0x04EA, + 0xFFFF, 0x04EC, 0xFFFF, 0x04EE, 0xFFFF, 0x04F0, 0xFFFF, 0x04F2, 0xFFFF, 0x04F4, 0xFFFF, + 0x04F6, 0xFFFF, 0x04F8, 0xFFFF, 0x04FA, 0xFFFF, 0x04FC, 0xFFFF, 0x04FE}; +static unsigned short unicode_title_data18[] = { + 0xFFFF, 0x0500, 0xFFFF, 0x0502, 0xFFFF, 0x0504, 0xFFFF, 0x0506, 0xFFFF, 0x0508, 0xFFFF, + 0x050A, 0xFFFF, 0x050C, 0xFFFF, 0x050E, 0xFFFF, 0x0510, 0xFFFF, 0x0512, 0xFFFF, 0x0514, + 0xFFFF, 0x0516, 0xFFFF, 0x0518, 0xFFFF, 0x051A, 0xFFFF, 0x051C, 0xFFFF, 0x051E, 0xFFFF, + 0x0520, 0xFFFF, 0x0522, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data19[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053A, 0x053B, + 0x053C, 0x053D, 0x053E, 0x053F, 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, + 0x0547, 0x0548, 0x0549, 0x054A, 0x054B, 0x054C, 0x054D, 0x054E, 0x054F}; +static unsigned short unicode_title_data20[] = { + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data21[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA77D, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C63, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data22[] = { + 0xFFFF, 0x1E00, 0xFFFF, 0x1E02, 0xFFFF, 0x1E04, 0xFFFF, 0x1E06, 0xFFFF, 0x1E08, 0xFFFF, + 0x1E0A, 0xFFFF, 0x1E0C, 0xFFFF, 0x1E0E, 0xFFFF, 0x1E10, 0xFFFF, 0x1E12, 0xFFFF, 0x1E14, + 0xFFFF, 0x1E16, 0xFFFF, 0x1E18, 0xFFFF, 0x1E1A, 0xFFFF, 0x1E1C, 0xFFFF, 0x1E1E, 0xFFFF, + 0x1E20, 0xFFFF, 0x1E22, 0xFFFF, 0x1E24, 0xFFFF, 0x1E26, 0xFFFF, 0x1E28, 0xFFFF, 0x1E2A, + 0xFFFF, 0x1E2C, 0xFFFF, 0x1E2E, 0xFFFF, 0x1E30, 0xFFFF, 0x1E32, 0xFFFF, 0x1E34, 0xFFFF, + 0x1E36, 0xFFFF, 0x1E38, 0xFFFF, 0x1E3A, 0xFFFF, 0x1E3C, 0xFFFF, 0x1E3E}; +static unsigned short unicode_title_data23[] = { + 0xFFFF, 0x1E40, 0xFFFF, 0x1E42, 0xFFFF, 0x1E44, 0xFFFF, 0x1E46, 0xFFFF, 0x1E48, 0xFFFF, + 0x1E4A, 0xFFFF, 0x1E4C, 0xFFFF, 0x1E4E, 0xFFFF, 0x1E50, 0xFFFF, 0x1E52, 0xFFFF, 0x1E54, + 0xFFFF, 0x1E56, 0xFFFF, 0x1E58, 0xFFFF, 0x1E5A, 0xFFFF, 0x1E5C, 0xFFFF, 0x1E5E, 0xFFFF, + 0x1E60, 0xFFFF, 0x1E62, 0xFFFF, 0x1E64, 0xFFFF, 0x1E66, 0xFFFF, 0x1E68, 0xFFFF, 0x1E6A, + 0xFFFF, 0x1E6C, 0xFFFF, 0x1E6E, 0xFFFF, 0x1E70, 0xFFFF, 0x1E72, 0xFFFF, 0x1E74, 0xFFFF, + 0x1E76, 0xFFFF, 0x1E78, 0xFFFF, 0x1E7A, 0xFFFF, 0x1E7C, 0xFFFF, 0x1E7E}; +static unsigned 
short unicode_title_data24[] = { + 0xFFFF, 0x1E80, 0xFFFF, 0x1E82, 0xFFFF, 0x1E84, 0xFFFF, 0x1E86, 0xFFFF, 0x1E88, 0xFFFF, + 0x1E8A, 0xFFFF, 0x1E8C, 0xFFFF, 0x1E8E, 0xFFFF, 0x1E90, 0xFFFF, 0x1E92, 0xFFFF, 0x1E94, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1E60, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x1EA0, 0xFFFF, 0x1EA2, 0xFFFF, 0x1EA4, 0xFFFF, 0x1EA6, 0xFFFF, 0x1EA8, 0xFFFF, 0x1EAA, + 0xFFFF, 0x1EAC, 0xFFFF, 0x1EAE, 0xFFFF, 0x1EB0, 0xFFFF, 0x1EB2, 0xFFFF, 0x1EB4, 0xFFFF, + 0x1EB6, 0xFFFF, 0x1EB8, 0xFFFF, 0x1EBA, 0xFFFF, 0x1EBC, 0xFFFF, 0x1EBE}; +static unsigned short unicode_title_data25[] = { + 0xFFFF, 0x1EC0, 0xFFFF, 0x1EC2, 0xFFFF, 0x1EC4, 0xFFFF, 0x1EC6, 0xFFFF, 0x1EC8, 0xFFFF, + 0x1ECA, 0xFFFF, 0x1ECC, 0xFFFF, 0x1ECE, 0xFFFF, 0x1ED0, 0xFFFF, 0x1ED2, 0xFFFF, 0x1ED4, + 0xFFFF, 0x1ED6, 0xFFFF, 0x1ED8, 0xFFFF, 0x1EDA, 0xFFFF, 0x1EDC, 0xFFFF, 0x1EDE, 0xFFFF, + 0x1EE0, 0xFFFF, 0x1EE2, 0xFFFF, 0x1EE4, 0xFFFF, 0x1EE6, 0xFFFF, 0x1EE8, 0xFFFF, 0x1EEA, + 0xFFFF, 0x1EEC, 0xFFFF, 0x1EEE, 0xFFFF, 0x1EF0, 0xFFFF, 0x1EF2, 0xFFFF, 0x1EF4, 0xFFFF, + 0x1EF6, 0xFFFF, 0x1EF8, 0xFFFF, 0x1EFA, 0xFFFF, 0x1EFC, 0xFFFF, 0x1EFE}; +static unsigned short unicode_title_data26[] = { + 0x1F08, 0x1F09, 0x1F0A, 0x1F0B, 0x1F0C, 0x1F0D, 0x1F0E, 0x1F0F, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F18, 0x1F19, 0x1F1A, 0x1F1B, 0x1F1C, 0x1F1D, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F28, + 0x1F29, 0x1F2A, 0x1F2B, 0x1F2C, 0x1F2D, 0x1F2E, 0x1F2F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F38, 0x1F39, 0x1F3A, 0x1F3B, 0x1F3C, 0x1F3D, 0x1F3E, + 0x1F3F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data27[] = { + 0x1F48, 0x1F49, 0x1F4A, 0x1F4B, 0x1F4C, 0x1F4D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F59, 0xFFFF, 0x1F5B, 0xFFFF, 0x1F5D, + 0xFFFF, 0x1F5F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0x1F68, + 0x1F69, 0x1F6A, 0x1F6B, 0x1F6C, 0x1F6D, 0x1F6E, 0x1F6F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FBA, 0x1FBB, 0x1FC8, 0x1FC9, 0x1FCA, 0x1FCB, 0x1FDA, + 0x1FDB, 0x1FF8, 0x1FF9, 0x1FEA, 0x1FEB, 0x1FFA, 0x1FFB, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data28[] = { + 0x1F88, 0x1F89, 0x1F8A, 0x1F8B, 0x1F8C, 0x1F8D, 0x1F8E, 0x1F8F, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1F98, 0x1F99, 0x1F9A, 0x1F9B, 0x1F9C, 0x1F9D, + 0x1F9E, 0x1F9F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FA8, + 0x1FA9, 0x1FAA, 0x1FAB, 0x1FAC, 0x1FAD, 0x1FAE, 0x1FAF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FB8, 0x1FB9, 0xFFFF, 0x1FBC, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0399, 0xFFFF}; +static unsigned short unicode_title_data29[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x1FCC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FD8, 0x1FD9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FE8, + 0x1FE9, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FEC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1FFC, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data30[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x2132, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, + 0x2167, 0x2168, 0x2169, 0x216A, 0x216B, 0x216C, 0x216D, 
0x216E, 0x216F}; +static unsigned short unicode_title_data31[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2183, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data32[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x24B6, 0x24B7, 0x24B8, 0x24B9, 0x24BA, 0x24BB, + 0x24BC, 0x24BD, 0x24BE, 0x24BF, 0x24C0, 0x24C1, 0x24C2, 0x24C3, 0x24C4, 0x24C5, 0x24C6, + 0x24C7, 0x24C8, 0x24C9, 0x24CA, 0x24CB, 0x24CC, 0x24CD, 0x24CE, 0x24CF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data33[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C00, 0x2C01, 0x2C02, 0x2C03, 0x2C04, 0x2C05, 0x2C06, + 0x2C07, 0x2C08, 0x2C09, 0x2C0A, 0x2C0B, 0x2C0C, 0x2C0D, 0x2C0E, 0x2C0F}; +static unsigned short unicode_title_data34[] = { + 0x2C10, 0x2C11, 0x2C12, 0x2C13, 0x2C14, 0x2C15, 0x2C16, 0x2C17, 0x2C18, 0x2C19, 0x2C1A, + 0x2C1B, 0x2C1C, 0x2C1D, 0x2C1E, 0x2C1F, 0x2C20, 0x2C21, 0x2C22, 0x2C23, 0x2C24, 0x2C25, + 0x2C26, 0x2C27, 0x2C28, 
0x2C29, 0x2C2A, 0x2C2B, 0x2C2C, 0x2C2D, 0x2C2E, 0xFFFF, 0xFFFF, + 0x2C60, 0xFFFF, 0xFFFF, 0xFFFF, 0x023A, 0x023E, 0xFFFF, 0x2C67, 0xFFFF, 0x2C69, 0xFFFF, + 0x2C6B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C72, 0xFFFF, 0xFFFF, 0x2C75, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data35[] = { + 0xFFFF, 0x2C80, 0xFFFF, 0x2C82, 0xFFFF, 0x2C84, 0xFFFF, 0x2C86, 0xFFFF, 0x2C88, 0xFFFF, + 0x2C8A, 0xFFFF, 0x2C8C, 0xFFFF, 0x2C8E, 0xFFFF, 0x2C90, 0xFFFF, 0x2C92, 0xFFFF, 0x2C94, + 0xFFFF, 0x2C96, 0xFFFF, 0x2C98, 0xFFFF, 0x2C9A, 0xFFFF, 0x2C9C, 0xFFFF, 0x2C9E, 0xFFFF, + 0x2CA0, 0xFFFF, 0x2CA2, 0xFFFF, 0x2CA4, 0xFFFF, 0x2CA6, 0xFFFF, 0x2CA8, 0xFFFF, 0x2CAA, + 0xFFFF, 0x2CAC, 0xFFFF, 0x2CAE, 0xFFFF, 0x2CB0, 0xFFFF, 0x2CB2, 0xFFFF, 0x2CB4, 0xFFFF, + 0x2CB6, 0xFFFF, 0x2CB8, 0xFFFF, 0x2CBA, 0xFFFF, 0x2CBC, 0xFFFF, 0x2CBE}; +static unsigned short unicode_title_data36[] = { + 0xFFFF, 0x2CC0, 0xFFFF, 0x2CC2, 0xFFFF, 0x2CC4, 0xFFFF, 0x2CC6, 0xFFFF, 0x2CC8, 0xFFFF, + 0x2CCA, 0xFFFF, 0x2CCC, 0xFFFF, 0x2CCE, 0xFFFF, 0x2CD0, 0xFFFF, 0x2CD2, 0xFFFF, 0x2CD4, + 0xFFFF, 0x2CD6, 0xFFFF, 0x2CD8, 0xFFFF, 0x2CDA, 0xFFFF, 0x2CDC, 0xFFFF, 0x2CDE, 0xFFFF, + 0x2CE0, 0xFFFF, 0x2CE2, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data37[] = { + 0x10A0, 0x10A1, 0x10A2, 0x10A3, 0x10A4, 0x10A5, 0x10A6, 0x10A7, 0x10A8, 0x10A9, 0x10AA, + 0x10AB, 0x10AC, 0x10AD, 0x10AE, 0x10AF, 0x10B0, 0x10B1, 0x10B2, 0x10B3, 0x10B4, 0x10B5, + 0x10B6, 0x10B7, 0x10B8, 0x10B9, 0x10BA, 0x10BB, 0x10BC, 0x10BD, 0x10BE, 0x10BF, 0x10C0, + 0x10C1, 0x10C2, 0x10C3, 0x10C4, 0x10C5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data38[] = { + 0xFFFF, 0xA640, 0xFFFF, 0xA642, 0xFFFF, 0xA644, 0xFFFF, 0xA646, 0xFFFF, 0xA648, 0xFFFF, + 0xA64A, 0xFFFF, 0xA64C, 0xFFFF, 0xA64E, 0xFFFF, 0xA650, 0xFFFF, 0xA652, 0xFFFF, 0xA654, + 0xFFFF, 0xA656, 0xFFFF, 0xA658, 0xFFFF, 0xA65A, 0xFFFF, 0xA65C, 0xFFFF, 0xA65E, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA662, 0xFFFF, 0xA664, 0xFFFF, 0xA666, 0xFFFF, 0xA668, 0xFFFF, 0xA66A, + 0xFFFF, 0xA66C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data39[] = { + 0xFFFF, 0xA680, 0xFFFF, 0xA682, 0xFFFF, 0xA684, 0xFFFF, 0xA686, 0xFFFF, 0xA688, 0xFFFF, + 0xA68A, 0xFFFF, 0xA68C, 0xFFFF, 0xA68E, 0xFFFF, 0xA690, 0xFFFF, 0xA692, 0xFFFF, 0xA694, + 0xFFFF, 0xA696, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data40[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xA722, 0xFFFF, 0xA724, 0xFFFF, 0xA726, 0xFFFF, 0xA728, 0xFFFF, 0xA72A, + 0xFFFF, 0xA72C, 0xFFFF, 0xA72E, 0xFFFF, 0xFFFF, 0xFFFF, 0xA732, 0xFFFF, 0xA734, 0xFFFF, + 0xA736, 0xFFFF, 0xA738, 0xFFFF, 0xA73A, 0xFFFF, 0xA73C, 0xFFFF, 0xA73E}; +static unsigned short unicode_title_data41[] = { + 0xFFFF, 0xA740, 0xFFFF, 0xA742, 0xFFFF, 0xA744, 0xFFFF, 0xA746, 0xFFFF, 0xA748, 0xFFFF, + 0xA74A, 0xFFFF, 0xA74C, 0xFFFF, 0xA74E, 0xFFFF, 0xA750, 0xFFFF, 0xA752, 0xFFFF, 
0xA754, + 0xFFFF, 0xA756, 0xFFFF, 0xA758, 0xFFFF, 0xA75A, 0xFFFF, 0xA75C, 0xFFFF, 0xA75E, 0xFFFF, + 0xA760, 0xFFFF, 0xA762, 0xFFFF, 0xA764, 0xFFFF, 0xA766, 0xFFFF, 0xA768, 0xFFFF, 0xA76A, + 0xFFFF, 0xA76C, 0xFFFF, 0xA76E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xA779, 0xFFFF, 0xA77B, 0xFFFF, 0xFFFF, 0xA77E}; +static unsigned short unicode_title_data42[] = { + 0xFFFF, 0xA780, 0xFFFF, 0xA782, 0xFFFF, 0xA784, 0xFFFF, 0xA786, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xA78B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_title_data43[] = { + 0xFFFF, 0xFF21, 0xFF22, 0xFF23, 0xFF24, 0xFF25, 0xFF26, 0xFF27, 0xFF28, 0xFF29, 0xFF2A, + 0xFF2B, 0xFF2C, 0xFF2D, 0xFF2E, 0xFF2F, 0xFF30, 0xFF31, 0xFF32, 0xFF33, 0xFF34, 0xFF35, + 0xFF36, 0xFF37, 0xFF38, 0xFF39, 0xFF3A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + +static unsigned short* unicode_title_data_table[UNICODE_TITLE_BLOCK_COUNT] = { + unicode_title_data0, unicode_title_data1, unicode_title_data2, unicode_title_data3, + unicode_title_data4, unicode_title_data5, unicode_title_data6, unicode_title_data7, + unicode_title_data8, unicode_title_data9, unicode_title_data10, unicode_title_data11, + unicode_title_data12, unicode_title_data13, unicode_title_data14, unicode_title_data15, + unicode_title_data16, unicode_title_data17, 
unicode_title_data18, unicode_title_data19, + unicode_title_data20, unicode_title_data21, unicode_title_data22, unicode_title_data23, + unicode_title_data24, unicode_title_data25, unicode_title_data26, unicode_title_data27, + unicode_title_data28, unicode_title_data29, unicode_title_data30, unicode_title_data31, + unicode_title_data32, unicode_title_data33, unicode_title_data34, unicode_title_data35, + unicode_title_data36, unicode_title_data37, unicode_title_data38, unicode_title_data39, + unicode_title_data40, unicode_title_data41, unicode_title_data42, unicode_title_data43}; +/* Generated by builder. Do not modify. End unicode_title_tables */ + +SQLITE_EXPORT u16 sqlite3_unicode_title(u16 c) { + u16 index = unicode_title_indexes[(c) >> UNICODE_TITLE_BLOCK_SHIFT]; + u8 position = (c)&UNICODE_TITLE_BLOCK_MASK; + u16(p) = (unicode_title_data_table[index][unicode_title_positions[index][position]]); + int l = unicode_title_positions[index][position + 1] - unicode_title_positions[index][position]; + + return ((l == 1) && ((p) == 0xFFFF)) ? c : p; +} +#endif + +#ifdef SQLITE3_UNICODE_UNACC +/* Generated by builder. Do not modify. Start unicode_unacc_defines */ +#define UNICODE_UNACC_BLOCK_SHIFT 5 +#define UNICODE_UNACC_BLOCK_MASK ((1 << UNICODE_UNACC_BLOCK_SHIFT) - 1) +#define UNICODE_UNACC_BLOCK_SIZE (1 << UNICODE_UNACC_BLOCK_SHIFT) +#define UNICODE_UNACC_BLOCK_COUNT 239 +#define UNICODE_UNACC_INDEXES_SIZE (0x10000 >> UNICODE_UNACC_BLOCK_SHIFT) +/* Generated by builder. Do not modify. End unicode_unacc_defines */ + +/* Generated by builder. Do not modify. 
Start unicode_unacc_tables */ + +static unsigned short unicode_unacc_indexes[UNICODE_UNACC_INDEXES_SIZE] = { + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 0, 0, 35, 0, 0, 0, 0, 36, 0, 37, 38, 39, 40, 41, 0, + 0, 42, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 44, 45, 0, + 0, 0, 46, 47, 0, 48, 49, 0, 0, 0, 0, 0, 0, 0, 50, 0, 51, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, + 0, 0, 0, 53, 54, 0, 55, 0, 56, 57, 0, 0, 0, 0, 0, 58, 0, 0, 0, + 0, 0, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 60, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 61, 62, 63, 64, 65, 66, 0, 0, 67, 68, 69, 70, 71, 72, 73, + 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 0, 0, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 0, 103, 0, 104, 0, 105, 106, + 0, 107, 108, 0, 0, 0, 109, 110, 111, 112, 113, 0, 0, 0, 0, 0, 114, 0, 115, + 116, 0, 0, 0, 117, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118, 119, 0, 0, 0, + 0, 0, 0, 0, 0, 120, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 133, 134, 135, 136, 0, 0, 0, 0, 0, 0, 0, 137, 138, 139, 0, 0, 0, 0, 0, + 0, 0, 140, 0, 0, 0, 0, 141, 0, 0, 0, 142, 0, 0, 143, 144, 145, 146, 147, + 148, 149, 150, 0, 151, 152, 153, 154, 155, 156, 157, 158, 0, 159, 160, 161, 162, 0, 0, + 0, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 179, 0, 180, 0, 0, 0, 0, 181, 182, 183, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 184, 185, 186, + 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 0, 199, 200, 201, 202, 203, 204, + 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238}; + +static unsigned char + unicode_unacc_positions[UNICODE_UNACC_BLOCK_COUNT][UNICODE_UNACC_BLOCK_SIZE + 1] = { + /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 1 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 34, 37, 38}, + /* 2 */ {0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 3 */ {0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 4 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 5 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35}, + /* 6 */ {0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}, + /* 7 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 8 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 9 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 10 */ {0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}, + /* 11 */ {0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 21, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 37, 38, 39}, + /* 12 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 13 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 14 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 15 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 16 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 17 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 18 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 19 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 20 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 21 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 22 */ {0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 23 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 24 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 25 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 26 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 27 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 28 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 29 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 30 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 32 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 33 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 34 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 35 */ {0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 36 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31, 32}, + /* 37 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 23, 25, 27, 29, 30, 31, 32, 33, 34, 35, 36}, + /* 38 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 39 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 40 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 41 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 42 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 43 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 44 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 45 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 46 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 47 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 48 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 49 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 50 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 51 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 52 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 53 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 54 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 32, 33, 34}, + /* 55 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 56 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 57 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 58 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 59 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 60 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 61 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 62 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 63 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 64 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 65 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32}, + /* 66 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 67 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 68 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 69 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 70 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 71 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33}, + /* 72 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 73 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 74 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 75 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 76 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 77 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 78 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 79 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 80 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 81 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 82 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 83 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 84 */ {0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 24, 27, 28, 30, 33, 34, 35, 36, 37, 39, 40, 41, 42}, + /* 85 */ {0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38}, + /* 86 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 87 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 88 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 89 */ {0, 3, 6, 7, 9, 10, 13, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43}, + /* 90 */ {0, 2, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 36, 37, 38}, + /* 91 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 57}, + /* 92 */ {0, 1, 3, 6, 8, 9, 11, 14, 18, 20, 21, 23, 26, 27, 28, 29, 30, + 31, 33, 36, 38, 39, 41, 44, 48, 50, 51, 53, 56, 57, 58, 59, 60}, + /* 93 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 94 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + 
/* 95 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 96 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 97 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 98 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38}, + /* 99 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 100 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 101 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 102 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 103 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 104 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 105 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 106 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 107 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 108 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 109 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 15, 17, 19, 21, 23, + 
25, 27, 29, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 62, 66, 70}, + /* 110 */ {0, 4, 8, 12, 16, 20, 24, 28, 32, 34, 36, 38, 40, 42, 44, 46, 48, + 50, 53, 56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95}, + /* 111 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, + 51, 54, 57, 60, 63, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76}, + /* 112 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 113 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 114 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 115 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 116 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 117 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 118 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 119 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 120 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 121 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 122 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 123 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32}, + /* 124 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 125 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 126 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 127 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}, + /* 128 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 129 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 130 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 23, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37}, + /* 131 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 132 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 133 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 134 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 135 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 136 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 137 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 138 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 139 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 140 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 141 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 142 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 143 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 144 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 145 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 146 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 147 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 148 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 149 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 150 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 151 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 152 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 30, 31, 32}, + /* 153 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 154 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 155 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33}, + /* 156 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 157 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 158 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33}, + /* 159 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 160 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 161 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 162 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 163 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 46, 50, + 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94, 98, 102, 109, 115, 116}, + /* 164 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, + 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96}, + /* 165 */ {0, 3, 6, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57}, + /* 166 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, + 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 47, 51, 53, 54}, + /* 167 */ {0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 168 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47}, + /* 169 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 24, 27, 29, 32, 34, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53}, + /* 170 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 171 */ {0, 4, 8, 12, 15, 19, 22, 25, 30, 34, 37, 40, 43, 47, 51, 54, 57, + 59, 62, 66, 70, 72, 77, 83, 88, 91, 96, 101, 105, 108, 111, 114, 118}, + /* 172 */ {0, 5, 9, 12, 15, 18, 20, 22, 24, 26, 29, 32, 37, 40, 44, 49, 52, + 54, 56, 61, 65, 70, 73, 78, 80, 83, 86, 89, 92, 95, 99, 102, 104}, + /* 173 */ {0, 3, 6, 9, 13, 16, 19, 22, 27, 31, 33, 38, 40, 44, 48, 51, 54, + 57, 61, 63, 66, 70, 72, 77, 80, 82, 84, 86, 88, 90, 92, 94, 96}, + /* 174 */ {0, 2, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, + 49, 52, 54, 56, 59, 61, 63, 65, 68, 71, 73, 75, 77, 79, 81, 85}, + /* 175 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 19, 23, 25, 27, 29, 31, 33, 35, + 37, 40, 43, 46, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 72}, + /* 176 */ {0, 3, 5, 8, 11, 14, 16, 19, 22, 26, 28, 31, 34, 37, 40, 45, 51, + 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83}, + /* 177 */ {0, 2, 4, 8, 10, 12, 14, 18, 21, 23, 25, 27, 29, 31, 33, 35, 37, + 39, 41, 44, 46, 48, 51, 54, 56, 60, 63, 65, 67, 69, 71, 74, 77}, + /* 178 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 24, 27, 30, 33, 36, 39, + 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87}, + /* 179 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 180 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 181 */ {0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 182 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 183 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 184 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 185 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 186 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 187 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 188 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 189 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 190 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 191 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 192 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 193 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 194 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 195 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 196 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 197 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 198 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 199 */ {0, 2, 4, 6, 9, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 30, 32, 34, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46}, + /* 200 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 201 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 202 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 203 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 204 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 205 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33}, + /* 206 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 47, 48, 49, 50}, + /* 207 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + /* 208 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + /* 209 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 55, 56, 57, 58, 59}, + /* 210 
*/ {0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60}, + /* 211 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}, + /* 212 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + /* 213 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 51, 53, 55, 57, 59, 61, 63}, + /* 214 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 37, 38, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61}, + /* 215 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + /* 216 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 57, 58, 59, 60}, + /* 217 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64}, + /* 218 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, + 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96}, + /* 219 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, + 49, 50, 53, 56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92}, + /* 220 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, + 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96}, + /* 221 */ {0, 3, 6, 9, 12, 15, 18, 21, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48}, + /* 222 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 19, 22, 26, 30, 34, 38, 42, 46, 50, 53, 71, 79, 83, 84, 85, 86}, + /* 223 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34}, + /* 224 */ {0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}, + /* 225 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 226 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 227 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 228 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 229 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 230 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 38, 39, 40}, + /* 231 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 232 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 233 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 234 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 235 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 236 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 237 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + /* 238 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}}; + +static unsigned short unicode_unacc_data0[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data1[] = { + 0x0020, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0xFFFF, + 0x0061, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0xFFFF, 0xFFFF, 0x0032, 0x0033, + 0x0020, 0x03BC, 0xFFFF, 0xFFFF, 0x0020, 0x0031, 0x006F, 0xFFFF, 0x0031, 0x2044, + 0x0034, 0x0031, 0x2044, 0x0032, 0x0033, 0x2044, 0x0034, 0xFFFF}; +static unsigned short unicode_unacc_data2[] = { + 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0045, 0x0043, 0x0045, 0x0045, + 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, 0xFFFF, 0x004E, 0x004F, 0x004F, 0x004F, + 0x004F, 0x004F, 0xFFFF, 0x004F, 0x0055, 0x0055, 0x0055, 0x0055, 0x0059, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data3[] = { + 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0065, 0x0063, 0x0065, 0x0065, + 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, 0xFFFF, 0x006E, 0x006F, 0x006F, 0x006F, + 0x006F, 0x006F, 0xFFFF, 0x006F, 0x0075, 0x0075, 0x0075, 0x0075, 0x0079, 0xFFFF, 0x0079}; +static unsigned short unicode_unacc_data4[] = { + 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0043, 0x0063, 0x0043, 0x0063, 0x0043, + 0x0063, 0x0043, 0x0063, 0x0044, 0x0064, 0x0044, 0x0064, 0x0045, 0x0065, 0x0045, 0x0065, + 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067}; +static unsigned short unicode_unacc_data5[] = { + 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0048, 0x0068, 0x0049, 0x0069, 0x0049, 0x0069, + 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0xFFFF, 0x0049, 0x004A, 0x0069, 0x006A, 0x004A, 0x006A, + 0x004B, 0x006B, 0xFFFF, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 
0x006C, 0x004C, 0x00B7}; +static unsigned short unicode_unacc_data6[] = { + 0x006C, 0x00B7, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, 0x006E, 0x02BC, 0x006E, + 0xFFFF, 0xFFFF, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x0045, 0x006F, 0x0065, + 0x0052, 0x0072, 0x0052, 0x0072, 0x0052, 0x0072, 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073}; +static unsigned short unicode_unacc_data7[] = { + 0x0053, 0x0073, 0x0054, 0x0074, 0x0054, 0x0074, 0x0054, 0x0074, 0x0055, 0x0075, 0x0055, + 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0057, 0x0077, + 0x0059, 0x0079, 0x0059, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, 0x0073}; +static unsigned short unicode_unacc_data8[] = { + 0x0062, 0x0042, 0x0042, 0x0062, 0xFFFF, 0xFFFF, 0xFFFF, 0x0043, 0x0063, 0xFFFF, 0x0044, + 0x0044, 0x0064, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0046, 0x0066, 0x0047, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0049, 0x004B, 0x006B, 0x006C, 0xFFFF, 0xFFFF, 0x004E, 0x006E, 0x004F}; +static unsigned short unicode_unacc_data9[] = { + 0x004F, 0x006F, 0xFFFF, 0xFFFF, 0x0050, 0x0070, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0074, 0x0054, 0x0074, 0x0054, 0x0055, 0x0075, 0xFFFF, 0x0056, 0x0059, 0x0079, 0x005A, + 0x007A, 0xFFFF, 0xFFFF, 0xFFFF, 0x0292, 0xFFFF, 0xFFFF, 0xFFFF, 0x0296, 0xFFFF}; +static unsigned short unicode_unacc_data10[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0044, 0x005A, 0x0044, 0x007A, 0x0064, 0x007A, 0x004C, + 0x004A, 0x004C, 0x006A, 0x006C, 0x006A, 0x004E, 0x004A, 0x004E, 0x006A, 0x006E, 0x006A, + 0x0041, 0x0061, 0x0049, 0x0069, 0x004F, 0x006F, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, + 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0xFFFF, 0x0041, 0x0061}; +static unsigned short unicode_unacc_data11[] = { + 0x0041, 0x0061, 0x0041, 0x0045, 0x0061, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, + 0x004B, 0x006B, 0x004F, 0x006F, 0x004F, 0x006F, 0x01B7, 0x0292, 0x006A, 0x0044, + 0x005A, 0x0044, 0x007A, 0x0064, 0x007A, 0x0047, 0x0067, 0xFFFF, 0xFFFF, 0x004E, + 0x006E, 
0x0041, 0x0061, 0x0041, 0x0045, 0x0061, 0x0065, 0x004F, 0x006F}; +static unsigned short unicode_unacc_data12[] = { + 0x0041, 0x0061, 0x0041, 0x0061, 0x0045, 0x0065, 0x0045, 0x0065, 0x0049, 0x0069, 0x0049, + 0x0069, 0x004F, 0x006F, 0x004F, 0x006F, 0x0052, 0x0072, 0x0052, 0x0072, 0x0055, 0x0075, + 0x0055, 0x0075, 0x0053, 0x0073, 0x0054, 0x0074, 0xFFFF, 0xFFFF, 0x0048, 0x0068}; +static unsigned short unicode_unacc_data13[] = { + 0x004E, 0x0064, 0xFFFF, 0xFFFF, 0x005A, 0x007A, 0x0041, 0x0061, 0x0045, 0x0065, 0x004F, + 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x0059, 0x0079, 0x006C, 0x006E, + 0x0074, 0xFFFF, 0xFFFF, 0xFFFF, 0x0041, 0x0043, 0x0063, 0x004C, 0x0054, 0x0073}; +static unsigned short unicode_unacc_data14[] = { + 0x007A, 0xFFFF, 0xFFFF, 0x0042, 0xFFFF, 0xFFFF, 0x0045, 0x0065, 0x004A, 0x006A, 0xFFFF, + 0x0071, 0x0052, 0x0072, 0x0059, 0x0079, 0xFFFF, 0xFFFF, 0xFFFF, 0x0062, 0xFFFF, 0x0063, + 0x0064, 0x0064, 0xFFFF, 0xFFFF, 0x0259, 0xFFFF, 0xFFFF, 0x025C, 0xFFFF, 0x0237}; +static unsigned short unicode_unacc_data15[] = { + 0x0067, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0068, 0xA727, 0x0069, 0xFFFF, 0xFFFF, + 0x006C, 0x006C, 0x006C, 0xFFFF, 0xFFFF, 0x026F, 0x006D, 0x006E, 0x006E, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0279, 0x0279, 0x0072, 0x0072, 0x0072, 0xFFFF}; +static unsigned short unicode_unacc_data16[] = { + 0xFFFF, 0xFFFF, 0x0073, 0xFFFF, 0x0237, 0xFFFF, 0x0283, 0xFFFF, 0x0074, 0xFFFF, 0xFFFF, + 0x0076, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x007A, 0x007A, 0xFFFF, 0x0292, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0262, 0xFFFF, 0x006A, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data17[] = { + 0x0071, 0x0294, 0xFFFF, 0xFFFF, 0xFFFF, 0x02A3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0265, 0x0265, 0x0068, 0x0068, 0x006A, 0x0072, 0x0279, 0x0279, + 0x0281, 0x0077, 0x0079, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data18[] = { + 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data19[] = { + 0x0263, 0x006C, 0x0073, 0x0078, 0x0295, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data20[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x02B9, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0xFFFF, 0xFFFF, 0xFFFF, 0x003B, 0xFFFF}; +static unsigned short unicode_unacc_data21[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0x0020, 0x0391, 0x00B7, 0x0395, 0x0397, 0x0399, + 0xFFFF, 0x039F, 0xFFFF, 0x03A5, 0x03A9, 0x03B9, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data22[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0399, + 0x03A5, 0x03B1, 0x03B5, 0x03B7, 0x03B9, 0x03C5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data23[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x03B9, + 0x03C5, 0x03BF, 0x03C5, 0x03C9, 0xFFFF, 0x03B2, 0x03B8, 0x03A5, 0x03A5, 0x03A5, 0x03C6, + 0x03C0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data24[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0x03BA, 0x03C1, 0x03C2, 0xFFFF, 0x0398, 0x03B5, + 0xFFFF, 0xFFFF, 0xFFFF, 0x03A3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data25[] = { + 0x0415, 0x0415, 0xFFFF, 0x0413, 0xFFFF, 0xFFFF, 0xFFFF, 0x0406, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x041A, 0x0418, 0x0423, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0418, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data26[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0438, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data27[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0435, 0x0435, 0xFFFF, 0x0433, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0456, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x043A, 0x0438, 0x0443, 0xFFFF}; +static unsigned short unicode_unacc_data28[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0474, 0x0475, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0460, 0x0461, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data29[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0418, + 0x0438, 0xFFFF, 0xFFFF, 0x0420, 0x0440, 0x0413, 0x0433, 0x0413, 0x0433, 0x0413, 0x0433, + 0x0416, 0x0436, 0x0417, 0x0437, 0x041A, 0x043A, 0x041A, 0x043A, 0x041A, 0x043A}; +static unsigned short unicode_unacc_data30[] = { + 0xFFFF, 0xFFFF, 0x041D, 0x043D, 0xFFFF, 0xFFFF, 0x041F, 0x043F, 0xFFFF, 0xFFFF, 0x0421, + 0x0441, 0x0422, 0x0442, 0xFFFF, 0xFFFF, 0x04AE, 0x04AF, 0x0425, 0x0445, 0xFFFF, 0xFFFF, + 0x0427, 0x0447, 0x0427, 0x0447, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0x04BC, 0x04BD}; +static unsigned short unicode_unacc_data31[] = { + 0xFFFF, 0x0416, 0x0436, 0x041A, 0x043A, 0x041B, 0x043B, 0x041D, 0x043D, 0x041D, 0x043D, + 0xFFFF, 0xFFFF, 0x041C, 0x043C, 0xFFFF, 0x0410, 0x0430, 0x0410, 0x0430, 0xFFFF, 0xFFFF, + 0x0415, 0x0435, 0xFFFF, 0xFFFF, 0x04D8, 0x04D9, 0x0416, 0x0436, 0x0417, 0x0437}; +static unsigned short unicode_unacc_data32[] = { + 0xFFFF, 0xFFFF, 0x0418, 0x0438, 0x0418, 0x0438, 0x041E, 0x043E, 0xFFFF, 0xFFFF, 0x04E8, + 0x04E9, 0x042D, 0x044D, 0x0423, 0x0443, 0x0423, 0x0443, 0x0423, 0x0443, 0x0427, 0x0447, + 0x0413, 0x0433, 0x042B, 0x044B, 0x0413, 0x0433, 0x0425, 0x0445, 0x0425, 0x0445}; +static unsigned short unicode_unacc_data33[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x041B, 0x043B, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data34[] = { + 0x041B, 0x043B, 0x041D, 0x043D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data35[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0565, 0x0582, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data36[] = { + 0xFFFF, 0xFFFF, 0x0627, 0x0627, 0x0648, 0x0627, 0x064A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x06A9, 0x06A9, 0x06CC, 0x06CC, 0x06CC}; +static unsigned short unicode_unacc_data37[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0627, 0x0627, 0xFFFF, 0x0627, 0x0674, 0x0648, + 0x0674, 0x06C7, 0x0674, 0x064A, 0x0674, 0xFFFF, 0xFFFF, 0xFFFF, 0x062A, 0x062A, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data38[] = { + 0xFFFF, 0x062D, 0x062D, 0xFFFF, 0xFFFF, 0x062D, 0xFFFF, 0xFFFF, 0xFFFF, 0x062F, 0x062F, + 0x062F, 0xFFFF, 0xFFFF, 0xFFFF, 0x062F, 0x062F, 0xFFFF, 0x0631, 0x0631, 0x0631, 0x0631, + 0x0631, 0x0631, 0xFFFF, 0x0631, 0x0633, 0x0633, 0x0633, 0x0635, 0x0635, 0x0637}; +static unsigned short unicode_unacc_data39[] = { + 0x0639, 0xFFFF, 0x0641, 0x0641, 0xFFFF, 0x0641, 0xFFFF, 0x0642, 0x0642, 0xFFFF, 0xFFFF, + 0x0643, 0x0643, 0xFFFF, 0x0643, 0xFFFF, 0x06AF, 0xFFFF, 0x06AF, 0xFFFF, 0x06AF, 0x0644, + 0x0644, 0x0644, 0x0644, 0x0646, 0xFFFF, 0xFFFF, 0x0646, 0x0646, 0xFFFF, 0x0686}; +static unsigned short unicode_unacc_data40[] = { + 0x06D5, 0xFFFF, 0x06C1, 0xFFFF, 0x0648, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0648, + 0xFFFF, 0xFFFF, 0x064A, 0x064A, 0x0648, 0xFFFF, 0x064A, 0xFFFF, 0x06D2, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data41[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x062F, 0x0631, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0634, 0x0636, 0x063A, 0xFFFF, 0xFFFF, 0x0647}; +static unsigned short unicode_unacc_data42[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0628, 0x0628, 0x0628, 0x0628, 0x0628, 0x0628, + 0x0628, 0x062D, 0x062D, 0x062F, 0x062F, 0x0631, 0x0633, 0x0639, 0x0639, 0x0639}; +static unsigned short unicode_unacc_data43[] = { + 0x0641, 0x0641, 0x06A9, 0x06A9, 0x06A9, 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 0x0644, + 0x0631, 0x0631, 0x0633, 0x062D, 0x062D, 
0x0633, 0x0631, 0x062D, 0x0627, 0x0627, 0x06CC, + 0x06CC, 0x06CC, 0x0648, 0x0648, 0x06D2, 0x06D2, 0x062D, 0x0633, 0x0633, 0x0643}; +static unsigned short unicode_unacc_data44[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0928, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0930, 0xFFFF, 0xFFFF, 0x0933, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data45[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0915, 0x0916, 0x0917, 0x091C, 0x0921, 0x0922, 0x092B, 0x092F}; +static unsigned short unicode_unacc_data46[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x09A1, 0x09A2, 0xFFFF, 0x09AF}; +static unsigned short unicode_unacc_data47[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x09B0, 0x09B0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data48[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0A32, 0xFFFF, 0xFFFF, + 0x0A38, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data49[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0A16, 0x0A17, 0x0A1C, 0xFFFF, 0xFFFF, 0x0A2B, 
0xFFFF}; +static unsigned short unicode_unacc_data50[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0B21, 0x0B22, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data51[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0B92, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data52[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0E32, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data53[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0EB2, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data54[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0EAB, 0x0E99, 0x0EAB, 0x0EA1, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data55[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0F0B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data56[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x0F42, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0F4C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0F51, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0F56, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0F5B, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data57[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0F40, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data58[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1025, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data59[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x10DC, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data60[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1B05, 0xFFFF, 0x1B07, 0xFFFF, 0x1B09, + 0xFFFF, 0x1B0B, 0xFFFF, 0x1B0D, 0xFFFF, 0xFFFF, 0xFFFF, 0x1B11, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data61[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x029F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1D11, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data62[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0041, 0x0041, 0x0045, 0x0042, 0xFFFF, 0x0044, 0x0045, 0x018E, 
0x0047, 0x0048, + 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0xFFFF, 0x004F, 0x0222, 0x0050, 0x0052}; +static unsigned short unicode_unacc_data63[] = { + 0x0054, 0x0055, 0x0057, 0x0061, 0x0250, 0x0251, 0x1D02, 0x0062, 0x0064, 0x0065, 0x0259, + 0x025B, 0x025C, 0x0067, 0xFFFF, 0x006B, 0x006D, 0x014B, 0x006F, 0x0254, 0x1D16, 0x1D17, + 0x0070, 0x0074, 0x0075, 0x1D1D, 0x026F, 0x0076, 0x1D25, 0x03B2, 0x03B3, 0x03B4}; +static unsigned short unicode_unacc_data64[] = { + 0x03C6, 0x03C7, 0x0069, 0x0072, 0x0075, 0x0076, 0x03B2, 0x03B3, 0x03C1, 0x03C6, 0x03C7, + 0xFFFF, 0x0062, 0x0064, 0x0066, 0x006D, 0x006E, 0x0070, 0x0072, 0x0072, 0x0073, 0x0074, + 0x007A, 0xFFFF, 0x043D, 0xFFFF, 0xFFFF, 0xFFFF, 0x0269, 0x0070, 0xFFFF, 0x028A}; +static unsigned short unicode_unacc_data65[] = { + 0x0062, 0x0064, 0x0066, 0x0067, 0x006B, 0x006C, 0x006D, 0x006E, 0x0070, 0x0072, 0x0073, + 0x0283, 0x0076, 0x0078, 0x007A, 0x0061, 0x0251, 0x0064, 0x0065, 0x025B, 0x025C, 0x0259, + 0x0069, 0x0254, 0x0283, 0x0075, 0x0292, 0x0252, 0x0063, 0x0063, 0x00F0, 0x025C}; +static unsigned short unicode_unacc_data66[] = { + 0x0066, 0x0237, 0x0261, 0x0265, 0x0069, 0x0269, 0x026A, 0x1D7B, 0x006A, 0x006C, 0x006C, + 0x029F, 0x006D, 0x026F, 0x006E, 0x006E, 0x0274, 0x0275, 0x0278, 0x0073, 0x0283, 0x0074, + 0x0289, 0x028A, 0x1D1C, 0x0076, 0x028C, 0x007A, 0x007A, 0x007A, 0x0292, 0x03B8}; +static unsigned short unicode_unacc_data67[] = { + 0x0041, 0x0061, 0x0042, 0x0062, 0x0042, 0x0062, 0x0042, 0x0062, 0x0043, 0x0063, 0x0044, + 0x0064, 0x0044, 0x0064, 0x0044, 0x0064, 0x0044, 0x0064, 0x0044, 0x0064, 0x0045, 0x0065, + 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0046, 0x0066}; +static unsigned short unicode_unacc_data68[] = { + 0x0047, 0x0067, 0x0048, 0x0068, 0x0048, 0x0068, 0x0048, 0x0068, 0x0048, 0x0068, 0x0048, + 0x0068, 0x0049, 0x0069, 0x0049, 0x0069, 0x004B, 0x006B, 0x004B, 0x006B, 0x004B, 0x006B, + 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004D, 0x006D}; +static 
unsigned short unicode_unacc_data69[] = { + 0x004D, 0x006D, 0x004D, 0x006D, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, + 0x006E, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x0050, 0x0070, + 0x0050, 0x0070, 0x0052, 0x0072, 0x0052, 0x0072, 0x0052, 0x0072, 0x0052, 0x0072}; +static unsigned short unicode_unacc_data70[] = { + 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073, 0x0054, + 0x0074, 0x0054, 0x0074, 0x0054, 0x0074, 0x0054, 0x0074, 0x0055, 0x0075, 0x0055, 0x0075, + 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0056, 0x0076, 0x0056, 0x0076}; +static unsigned short unicode_unacc_data71[] = { + 0x0057, 0x0077, 0x0057, 0x0077, 0x0057, 0x0077, 0x0057, 0x0077, 0x0057, 0x0077, 0x0058, + 0x0078, 0x0058, 0x0078, 0x0059, 0x0079, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, + 0x0068, 0x0074, 0x0077, 0x0079, 0x0061, 0x02BE, 0x0073, 0x0073, 0x0073, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data72[] = { + 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, + 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, + 0x0041, 0x0061, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065}; +static unsigned short unicode_unacc_data73[] = { + 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, 0x0049, 0x0069, 0x0049, + 0x0069, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, + 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F}; +static unsigned short unicode_unacc_data74[] = { + 0x004F, 0x006F, 0x004F, 0x006F, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, + 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0059, 0x0079, 0x0059, 0x0079, + 0x0059, 0x0079, 0x0059, 0x0079, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0059, 0x0079}; +static unsigned short unicode_unacc_data75[] = { + 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 
0x0391, 0x0391, 0x0391, + 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x03B5, 0x03B5, 0x03B5, 0x03B5, 0x03B5, 0x03B5, + 0xFFFF, 0xFFFF, 0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data76[] = { + 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x0397, 0x0397, 0x0397, + 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x03B9, 0x03B9, 0x03B9, 0x03B9, 0x03B9, 0x03B9, + 0x03B9, 0x03B9, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399}; +static unsigned short unicode_unacc_data77[] = { + 0x03BF, 0x03BF, 0x03BF, 0x03BF, 0x03BF, 0x03BF, 0xFFFF, 0xFFFF, 0x039F, 0x039F, 0x039F, + 0x039F, 0x039F, 0x039F, 0xFFFF, 0xFFFF, 0x03C5, 0x03C5, 0x03C5, 0x03C5, 0x03C5, 0x03C5, + 0x03C5, 0x03C5, 0xFFFF, 0x03A5, 0xFFFF, 0x03A5, 0xFFFF, 0x03A5, 0xFFFF, 0x03A5}; +static unsigned short unicode_unacc_data78[] = { + 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03A9, 0x03A9, 0x03A9, + 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03B1, 0x03B1, 0x03B5, 0x03B5, 0x03B7, 0x03B7, + 0x03B9, 0x03B9, 0x03BF, 0x03BF, 0x03C5, 0x03C5, 0x03C9, 0x03C9, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data79[] = { + 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x0391, 0x0391, 0x0391, + 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, 0x03B7, + 0x03B7, 0x03B7, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397}; +static unsigned short unicode_unacc_data80[] = { + 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03C9, 0x03A9, 0x03A9, 0x03A9, + 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0x03B1, 0xFFFF, + 0x03B1, 0x03B1, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0020, 0x03B9, 0x0020}; +static unsigned short unicode_unacc_data81[] = { + 0x0020, 0x0020, 0x03B7, 0x03B7, 0x03B7, 0xFFFF, 0x03B7, 0x03B7, 0x0395, 0x0395, 0x0397, + 0x0397, 0x0397, 0x0020, 0x0020, 0x0020, 0x03B9, 0x03B9, 0x03B9, 0x03B9, 0xFFFF, 0xFFFF, + 
0x03B9, 0x03B9, 0x0399, 0x0399, 0x0399, 0x0399, 0xFFFF, 0x0020, 0x0020, 0x0020}; +static unsigned short unicode_unacc_data82[] = { + 0x03C5, 0x03C5, 0x03C5, 0x03C5, 0x03C1, 0x03C1, 0x03C5, 0x03C5, 0x03A5, 0x03A5, 0x03A5, + 0x03A5, 0x03A1, 0x0020, 0x0020, 0x0060, 0xFFFF, 0xFFFF, 0x03C9, 0x03C9, 0x03C9, 0xFFFF, + 0x03C9, 0x03C9, 0x039F, 0x039F, 0x03A9, 0x03A9, 0x03A9, 0x0020, 0x0020, 0xFFFF}; +static unsigned short unicode_unacc_data83[] = { + 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2010, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x0020, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data84[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x002E, 0x002E, 0x002E, 0x002E, 0x002E, 0x002E, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0xFFFF, 0xFFFF, 0xFFFF, + 0x2032, 0x2032, 0x2032, 0x2032, 0x2032, 0xFFFF, 0x2035, 0x2035, 0x2035, 0x2035, 0x2035, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0021, 0x0021, 0xFFFF, 0x0020, 0xFFFF}; +static unsigned short unicode_unacc_data85[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x005B, 0x005D, 0x003F, 0x003F, 0x003F, + 0x0021, 0x0021, 0x003F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2032, 0x2032, 0x2032, 0x2032, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020}; +static unsigned short unicode_unacc_data86[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0030, 0x0069, 0xFFFF, 0xFFFF, 0x0034, 0x0035, + 0x0036, 0x0037, 0x0038, 0x0039, 0x002B, 0x2212, 0x003D, 0x0028, 0x0029, 0x006E}; +static unsigned short unicode_unacc_data87[] = { + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x002B, + 0x2212, 0x003D, 0x0028, 0x0029, 0xFFFF, 0x0061, 0x0065, 0x006F, 0x0078, 0x0259, 
0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data88[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0052, 0x0073, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data89[] = { + 0x0061, 0x002F, 0x0063, 0x0061, 0x002F, 0x0073, 0x0043, 0x00B0, 0x0043, 0xFFFF, 0x0063, + 0x002F, 0x006F, 0x0063, 0x002F, 0x0075, 0x0190, 0xFFFF, 0x00B0, 0x0046, 0x0067, 0x0048, + 0x0048, 0x0048, 0x0068, 0x0068, 0x0049, 0x0049, 0x004C, 0x006C, 0xFFFF, 0x004E, 0x004E, + 0x006F, 0xFFFF, 0xFFFF, 0x0050, 0x0051, 0x0052, 0x0052, 0x0052, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data90[] = { + 0x0053, 0x004D, 0x0054, 0x0045, 0x004C, 0x0054, 0x004D, 0xFFFF, 0x005A, 0xFFFF, + 0x03A9, 0xFFFF, 0x005A, 0xFFFF, 0x004B, 0x0041, 0x0042, 0x0043, 0xFFFF, 0x0065, + 0x0045, 0x0046, 0xFFFF, 0x004D, 0x006F, 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x0069, + 0xFFFF, 0x0046, 0x0041, 0x0058, 0x03C0, 0x03B3, 0x0393, 0x03A0}; +static unsigned short unicode_unacc_data91[] = { + 0x2211, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0044, 0x0064, 0x0065, 0x0069, 0x006A, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0031, 0x2044, 0x0033, 0x0032, 0x2044, + 0x0033, 0x0031, 0x2044, 0x0035, 0x0032, 0x2044, 0x0035, 0x0033, 0x2044, 0x0035, 0x0034, 0x2044, + 0x0035, 0x0031, 0x2044, 0x0036, 0x0035, 0x2044, 0x0036, 0x0031, 0x2044, 0x0038, 0x0033, 0x2044, + 0x0038, 0x0035, 0x2044, 0x0038, 0x0037, 0x2044, 0x0038, 0x0031, 0x2044}; +static unsigned short unicode_unacc_data92[] = { + 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0056, 0x0056, 0x0056, 0x0049, 0x0056, + 0x0049, 0x0049, 0x0056, 0x0049, 0x0049, 0x0049, 0x0049, 0x0058, 0x0058, 0x0058, 0x0049, 0x0058, + 0x0049, 0x0049, 0x004C, 0x0043, 0x0044, 0x004D, 0x0069, 
0x0069, 0x0069, 0x0069, 0x0069, 0x0069, + 0x0069, 0x0076, 0x0076, 0x0076, 0x0069, 0x0076, 0x0069, 0x0069, 0x0076, 0x0069, 0x0069, 0x0069, + 0x0069, 0x0078, 0x0078, 0x0078, 0x0069, 0x0078, 0x0069, 0x0069, 0x006C, 0x0063, 0x0064, 0x006D}; +static unsigned short unicode_unacc_data93[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2190, 0x2192, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data94[] = { + 0xFFFF, 0xFFFF, 0x2190, 0x2192, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2195, 0x2190, 0x2192, + 0x2190, 0x2192, 0xFFFF, 0x2194, 0xFFFF, 0x2191, 0x2191, 0x2193, 0x2193, 0x2192, 0x2193, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data95[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x21D0, 0x21D4, 0x21D2, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2191, 0x2193}; +static unsigned short unicode_unacc_data96[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x21EB, 0x21EB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x2190, 0x2192, 0x2194, 0x2190, 0x2192, 0x2194, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data97[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2203, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2208, 0xFFFF, + 0xFFFF, 0x220B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data98[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2223, 0xFFFF, 0x2225, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x222B, 0x222B, 0x222B, 0x222B, 0x222B, 0xFFFF, 
0x222E, 0x222E, + 0x222E, 0x222E, 0x222E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data99[] = { + 0xFFFF, 0x223C, 0xFFFF, 0xFFFF, 0x2243, 0xFFFF, 0xFFFF, 0x2245, 0xFFFF, 0x2248, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data100[] = { + 0x003D, 0xFFFF, 0x2261, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x224D, 0x003C, 0x003E, 0x2264, 0x2265, 0xFFFF, 0xFFFF, 0x2272, 0x2273, + 0xFFFF, 0xFFFF, 0x2276, 0x2277, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data101[] = { + 0x227A, 0x227B, 0xFFFF, 0xFFFF, 0x2282, 0x2283, 0xFFFF, 0xFFFF, 0x2286, 0x2287, 0x2282, + 0x2283, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data102[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x22A2, 0x22A8, 0x22A9, 0x22AB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x221F, 0xFFFF}; +static unsigned short unicode_unacc_data103[] = { + 0x227C, 0x227D, 0x2291, 0x2292, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x22B2, + 0x22B3, 0x22B4, 0x22B5, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2208, 0x2208, 0x220A, 0x2208, + 0x2208, 0x220A, 0x2208, 0x2208, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data104[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x3008, 0x3009, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data105[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x221F, 0xFFFF, 0xFFFF, 0x007C}; +static unsigned short unicode_unacc_data106[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x25A1, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data107[] = { + 0xFFFF, 0x23C9, 0x23CA, 0xFFFF, 0x23C9, 0x23CA, 0xFFFF, 0x23C9, 0x23CA, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data108[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x232C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data109[] = { + 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x0031, 0x0030, 0x0031, + 0x0031, 0x0031, 0x0032, 0x0031, 0x0033, 0x0031, 0x0034, 0x0031, 0x0035, 0x0031, 0x0036, 0x0031, + 0x0037, 0x0031, 0x0038, 0x0031, 0x0039, 0x0032, 0x0030, 0x0028, 0x0031, 0x0029, 0x0028, 0x0032, + 0x0029, 0x0028, 0x0033, 0x0029, 0x0028, 0x0034, 0x0029, 0x0028, 0x0035, 0x0029, 0x0028, 0x0036, + 0x0029, 0x0028, 0x0037, 0x0029, 0x0028, 0x0038, 0x0029, 0x0028, 0x0039, 0x0029, 0x0028, 0x0031, + 0x0030, 0x0029, 0x0028, 0x0031, 0x0031, 0x0029, 0x0028, 0x0031, 0x0032, 0x0029}; +static unsigned short unicode_unacc_data110[] = { + 0x0028, 0x0031, 
0x0033, 0x0029, 0x0028, 0x0031, 0x0034, 0x0029, 0x0028, 0x0031, 0x0035, 0x0029, + 0x0028, 0x0031, 0x0036, 0x0029, 0x0028, 0x0031, 0x0037, 0x0029, 0x0028, 0x0031, 0x0038, 0x0029, + 0x0028, 0x0031, 0x0039, 0x0029, 0x0028, 0x0032, 0x0030, 0x0029, 0x0031, 0x002E, 0x0032, 0x002E, + 0x0033, 0x002E, 0x0034, 0x002E, 0x0035, 0x002E, 0x0036, 0x002E, 0x0037, 0x002E, 0x0038, 0x002E, + 0x0039, 0x002E, 0x0031, 0x0030, 0x002E, 0x0031, 0x0031, 0x002E, 0x0031, 0x0032, 0x002E, 0x0031, + 0x0033, 0x002E, 0x0031, 0x0034, 0x002E, 0x0031, 0x0035, 0x002E, 0x0031, 0x0036, 0x002E, 0x0031, + 0x0037, 0x002E, 0x0031, 0x0038, 0x002E, 0x0031, 0x0039, 0x002E, 0x0032, 0x0030, 0x002E, 0x0028, + 0x0061, 0x0029, 0x0028, 0x0062, 0x0029, 0x0028, 0x0063, 0x0029, 0x0028, 0x0064, 0x0029}; +static unsigned short unicode_unacc_data111[] = { + 0x0028, 0x0065, 0x0029, 0x0028, 0x0066, 0x0029, 0x0028, 0x0067, 0x0029, 0x0028, 0x0068, + 0x0029, 0x0028, 0x0069, 0x0029, 0x0028, 0x006A, 0x0029, 0x0028, 0x006B, 0x0029, 0x0028, + 0x006C, 0x0029, 0x0028, 0x006D, 0x0029, 0x0028, 0x006E, 0x0029, 0x0028, 0x006F, 0x0029, + 0x0028, 0x0070, 0x0029, 0x0028, 0x0071, 0x0029, 0x0028, 0x0072, 0x0029, 0x0028, 0x0073, + 0x0029, 0x0028, 0x0074, 0x0029, 0x0028, 0x0075, 0x0029, 0x0028, 0x0076, 0x0029, 0x0028, + 0x0077, 0x0029, 0x0028, 0x0078, 0x0029, 0x0028, 0x0079, 0x0029, 0x0028, 0x007A, 0x0029, + 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A}; +static unsigned short unicode_unacc_data112[] = { + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, + 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, + 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070}; +static unsigned short unicode_unacc_data113[] = { + 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x0030, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data114[] = { + 0xFFFF, 0xFFFF, 0x25A1, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data115[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x25A1, 0x25B3, 0xFFFF, 0xFFFF, 0xFFFF, 0x25A1, 0x25A1, 0x25A1, 0x25A1, 0x25CB, 0x25CB, + 0x25CB, 0x25CB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data116[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2610, 0x2610, 0xFFFF, 0x2602, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data117[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x25CB, 0x25CB, 0x25CF, 0x25CF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data118[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x25C7, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x22A5}; +static unsigned short unicode_unacc_data119[] = { + 0xFFFF, 0xFFFF, 0x27E1, 0x27E1, 0x25A1, 0x25A1, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data120[] = { 
+ 0xFFFF, 0xFFFF, 0x21D0, 0x21D2, 0x21D4, 0xFFFF, 0xFFFF, 0xFFFF, 0x2193, 0x2191, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2192, 0xFFFF, 0xFFFF, 0x2192, 0x2192, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data121[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x2196, 0x2197, 0x2198, 0x2199, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x293A, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data122[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2192, 0x2190, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data123[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x005B, 0x005D, 0x005B, 0x005D, 0x005B, 0x005D, 0x3008, 0x3009, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2220, 0xFFFF}; +static unsigned short unicode_unacc_data124[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2220, 0x29A3, 0xFFFF, 0xFFFF, 0x2221, 0x2221, 0x2221, + 0x2221, 0x2221, 0x2221, 0x2221, 0x2221, 0xFFFF, 0x2205, 0x2205, 0x2205, 0x2205, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data125[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x22C8, 0x22C8, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data126[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x29E3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x25C6, + 0xFFFF, 0x25CB, 0x25CF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x002F, 0x005C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data127[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x222B, 0x222B, 0x222B, 0x222B, 0xFFFF, 0x222B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x222B, 0x222B, 0x222B, 0x222B, 0x222B, 0x222B, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data128[] = { + 0xFFFF, 0xFFFF, 0x002B, 0x002B, 0x002B, 0x002B, 0x002B, 0x002B, 0x002B, 0x2212, 0x2212, + 0x2212, 0x2212, 0xFFFF, 0xFFFF, 0xFFFF, 0x00D7, 0x00D7, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data129[] = { + 0x2229, 0x222A, 0x222A, 0x2229, 0x2229, 0x222A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2227, 0x2228, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2227, 0x2228, 0x2227, 0x2228, 0x2227, 0x2227}; +static unsigned short unicode_unacc_data130[] = { + 0x2227, 0xFFFF, 0x2228, 0x2228, 0xFFFF, 0xFFFF, 0x003D, 0xFFFF, 0xFFFF, 0xFFFF, + 0x223C, 0x223C, 0xFFFF, 0xFFFF, 0xFFFF, 0x2248, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x003A, 0x003A, 0x003D, 0x003D, 0x003D, 0x003D, 0x003D, 0x003D, 0x003D, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2A7D}; +static unsigned short unicode_unacc_data131[] = { + 0x2A7E, 0x2A7D, 0x2A7E, 0x2A7D, 0x2A7E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x2A95, 0x2A96, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data132[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x2AA1, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x003D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data133[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0x2286, 0x2287, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x22D4, 0xFFFF, 0x2ADD, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data134[] = { + 0xFFFF, 0x27C2, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2ADF, 0x2AE0, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x2223, 0x007C, 0x007C, 0x22A4, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data135[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0x2192, 0x2192, 0x2190, 0x2190, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data136[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0x2190, 0x2190, 0x2190, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data137[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C24, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data138[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2C54, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned 
short unicode_unacc_data139[] = { + 0x004C, 0x006C, 0x004C, 0x0050, 0x0052, 0x0061, 0x0074, 0x0048, 0x0068, 0x004B, 0x006B, + 0x005A, 0x007A, 0xFFFF, 0x004D, 0xFFFF, 0xFFFF, 0x0076, 0x0057, 0x0077, 0x0076, 0xFFFF, + 0xFFFF, 0xFFFF, 0x0065, 0x0279, 0x006F, 0xFFFF, 0x006A, 0x0056, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data140[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2D61, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data141[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2010, 0x007E, 0xFFFF, 0xFFFF, 0x007E, 0x007E}; +static unsigned short unicode_unacc_data142[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x6BCD}; +static unsigned short unicode_unacc_data143[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x9F9F, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data144[] = { + 0x4E00, 0x4E28, 0x4E36, 0x4E3F, 0x4E59, 0x4E85, 0x4E8C, 0x4EA0, 0x4EBA, 0x513F, 0x5165, + 0x516B, 0x5182, 0x5196, 0x51AB, 0x51E0, 0x51F5, 0x5200, 0x529B, 0x52F9, 0x5315, 0x531A, + 0x5338, 0x5341, 0x535C, 0x5369, 0x5382, 0x53B6, 0x53C8, 0x53E3, 0x56D7, 0x571F}; +static unsigned short unicode_unacc_data145[] = { + 0x58EB, 0x5902, 0x590A, 0x5915, 0x5927, 0x5973, 0x5B50, 0x5B80, 0x5BF8, 0x5C0F, 
0x5C22, + 0x5C38, 0x5C6E, 0x5C71, 0x5DDB, 0x5DE5, 0x5DF1, 0x5DFE, 0x5E72, 0x5E7A, 0x5E7F, 0x5EF4, + 0x5EFE, 0x5F0B, 0x5F13, 0x5F50, 0x5F61, 0x5F73, 0x5FC3, 0x6208, 0x6236, 0x624B}; +static unsigned short unicode_unacc_data146[] = { + 0x652F, 0x6534, 0x6587, 0x6597, 0x65A4, 0x65B9, 0x65E0, 0x65E5, 0x66F0, 0x6708, 0x6728, + 0x6B20, 0x6B62, 0x6B79, 0x6BB3, 0x6BCB, 0x6BD4, 0x6BDB, 0x6C0F, 0x6C14, 0x6C34, 0x706B, + 0x722A, 0x7236, 0x723B, 0x723F, 0x7247, 0x7259, 0x725B, 0x72AC, 0x7384, 0x7389}; +static unsigned short unicode_unacc_data147[] = { + 0x74DC, 0x74E6, 0x7518, 0x751F, 0x7528, 0x7530, 0x758B, 0x7592, 0x7676, 0x767D, 0x76AE, + 0x76BF, 0x76EE, 0x77DB, 0x77E2, 0x77F3, 0x793A, 0x79B8, 0x79BE, 0x7A74, 0x7ACB, 0x7AF9, + 0x7C73, 0x7CF8, 0x7F36, 0x7F51, 0x7F8A, 0x7FBD, 0x8001, 0x800C, 0x8012, 0x8033}; +static unsigned short unicode_unacc_data148[] = { + 0x807F, 0x8089, 0x81E3, 0x81EA, 0x81F3, 0x81FC, 0x820C, 0x821B, 0x821F, 0x826E, 0x8272, + 0x8278, 0x864D, 0x866B, 0x8840, 0x884C, 0x8863, 0x897E, 0x898B, 0x89D2, 0x8A00, 0x8C37, + 0x8C46, 0x8C55, 0x8C78, 0x8C9D, 0x8D64, 0x8D70, 0x8DB3, 0x8EAB, 0x8ECA, 0x8F9B}; +static unsigned short unicode_unacc_data149[] = { + 0x8FB0, 0x8FB5, 0x9091, 0x9149, 0x91C6, 0x91CC, 0x91D1, 0x9577, 0x9580, 0x961C, 0x96B6, + 0x96B9, 0x96E8, 0x9751, 0x975E, 0x9762, 0x9769, 0x97CB, 0x97ED, 0x97F3, 0x9801, 0x98A8, + 0x98DB, 0x98DF, 0x9996, 0x9999, 0x99AC, 0x9AA8, 0x9AD8, 0x9ADF, 0x9B25, 0x9B2F}; +static unsigned short unicode_unacc_data150[] = { + 0x9B32, 0x9B3C, 0x9B5A, 0x9CE5, 0x9E75, 0x9E7F, 0x9EA5, 0x9EBB, 0x9EC3, 0x9ECD, 0x9ED1, + 0x9EF9, 0x9EFD, 0x9F0E, 0x9F13, 0x9F20, 0x9F3B, 0x9F4A, 0x9F52, 0x9F8D, 0x9F9C, 0x9FA0, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data151[] = { + 0x0020, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data152[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x3012, 0xFFFF, 0x5341, 0x5344, 0x5345, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data153[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x304B, 0xFFFF, 0x304D, 0xFFFF, 0x304F, 0xFFFF, 0x3051, 0xFFFF, 0x3053, 0xFFFF, + 0x3055, 0xFFFF, 0x3057, 0xFFFF, 0x3059, 0xFFFF, 0x305B, 0xFFFF, 0x305D, 0xFFFF}; +static unsigned short unicode_unacc_data154[] = { + 0x305F, 0xFFFF, 0x3061, 0xFFFF, 0xFFFF, 0x3064, 0xFFFF, 0x3066, 0xFFFF, 0x3068, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x306F, 0x306F, 0xFFFF, 0x3072, 0x3072, 0xFFFF, + 0x3075, 0x3075, 0xFFFF, 0x3078, 0x3078, 0xFFFF, 0x307B, 0x307B, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data155[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x3046, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0x0020, 0xFFFF, 0x309D, 0x3088, 0x308A}; +static unsigned short unicode_unacc_data156[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0x30AB, 0xFFFF, 0x30AD, 0xFFFF, 0x30AF, 0xFFFF, 0x30B1, 0xFFFF, 0x30B3, 0xFFFF, + 0x30B5, 0xFFFF, 0x30B7, 0xFFFF, 0x30B9, 0xFFFF, 0x30BB, 0xFFFF, 0x30BD, 0xFFFF}; +static unsigned short unicode_unacc_data157[] = { + 0x30BF, 0xFFFF, 0x30C1, 0xFFFF, 0xFFFF, 0x30C4, 0xFFFF, 0x30C6, 0xFFFF, 0x30C8, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x30CF, 0x30CF, 0xFFFF, 0x30D2, 0x30D2, 0xFFFF, + 0x30D5, 0x30D5, 0xFFFF, 0x30D8, 0x30D8, 0xFFFF, 0x30DB, 0x30DB, 0xFFFF, 0xFFFF}; +static unsigned short 
unicode_unacc_data158[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x30A6, 0xFFFF, + 0xFFFF, 0x30EF, 0x30F0, 0x30F1, 0x30F2, 0xFFFF, 0xFFFF, 0xFFFF, 0x30FD, 0x30B3, 0x30C8}; +static unsigned short unicode_unacc_data159[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x1100, 0x1101, 0x11AA, 0x1102, 0x11AC, + 0x11AD, 0x1103, 0x1104, 0x1105, 0x11B0, 0x11B1, 0x11B2, 0x11B3, 0x11B4, 0x11B5}; +static unsigned short unicode_unacc_data160[] = { + 0x111A, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, 0x110A, 0x110B, 0x110C, 0x110D, 0x110E, + 0x110F, 0x1110, 0x1111, 0x1112, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0x1167, + 0x1168, 0x1169, 0x116A, 0x116B, 0x116C, 0x116D, 0x116E, 0x116F, 0x1170, 0x1171}; +static unsigned short unicode_unacc_data161[] = { + 0x1172, 0x1173, 0x1174, 0x1175, 0x1160, 0x1114, 0x1115, 0x11C7, 0x11C8, 0x11CC, 0x11CE, + 0x11D3, 0x11D7, 0x11D9, 0x111C, 0x11DD, 0x11DF, 0x111D, 0x111E, 0x1120, 0x1122, 0x1123, + 0x1127, 0x1129, 0x112B, 0x112C, 0x112D, 0x112E, 0x112F, 0x1132, 0x1136, 0x1140}; +static unsigned short unicode_unacc_data162[] = { + 0x1147, 0x114C, 0x11F1, 0x11F2, 0x1157, 0x1158, 0x1159, 0x1184, 0x1185, 0x1188, 0x1191, + 0x1192, 0x1194, 0x119E, 0x11A1, 0xFFFF, 0xFFFF, 0xFFFF, 0x4E00, 0x4E8C, 0x4E09, 0x56DB, + 0x4E0A, 0x4E2D, 0x4E0B, 0x7532, 0x4E59, 0x4E19, 0x4E01, 0x5929, 0x5730, 0x4EBA}; +static unsigned short unicode_unacc_data163[] = { + 0x0028, 0x1100, 0x0029, 0x0028, 0x1102, 0x0029, 0x0028, 0x1103, 0x0029, 0x0028, 0x1105, 0x0029, + 0x0028, 0x1106, 0x0029, 0x0028, 0x1107, 0x0029, 0x0028, 0x1109, 0x0029, 0x0028, 0x110B, 0x0029, + 0x0028, 0x110C, 0x0029, 0x0028, 0x110E, 0x0029, 0x0028, 0x110F, 0x0029, 0x0028, 0x1110, 0x0029, + 0x0028, 0x1111, 0x0029, 0x0028, 0x1112, 0x0029, 0x0028, 0x1100, 0x1161, 0x0029, 0x0028, 0x1102, + 
0x1161, 0x0029, 0x0028, 0x1103, 0x1161, 0x0029, 0x0028, 0x1105, 0x1161, 0x0029, 0x0028, 0x1106, + 0x1161, 0x0029, 0x0028, 0x1107, 0x1161, 0x0029, 0x0028, 0x1109, 0x1161, 0x0029, 0x0028, 0x110B, + 0x1161, 0x0029, 0x0028, 0x110C, 0x1161, 0x0029, 0x0028, 0x110E, 0x1161, 0x0029, 0x0028, 0x110F, + 0x1161, 0x0029, 0x0028, 0x1110, 0x1161, 0x0029, 0x0028, 0x1111, 0x1161, 0x0029, 0x0028, 0x1112, + 0x1161, 0x0029, 0x0028, 0x110C, 0x116E, 0x0029, 0x0028, 0x110B, 0x1169, 0x110C, 0x1165, 0x11AB, + 0x0029, 0x0028, 0x110B, 0x1169, 0x1112, 0x116E, 0x0029, 0xFFFF}; +static unsigned short unicode_unacc_data164[] = { + 0x0028, 0x4E00, 0x0029, 0x0028, 0x4E8C, 0x0029, 0x0028, 0x4E09, 0x0029, 0x0028, 0x56DB, 0x0029, + 0x0028, 0x4E94, 0x0029, 0x0028, 0x516D, 0x0029, 0x0028, 0x4E03, 0x0029, 0x0028, 0x516B, 0x0029, + 0x0028, 0x4E5D, 0x0029, 0x0028, 0x5341, 0x0029, 0x0028, 0x6708, 0x0029, 0x0028, 0x706B, 0x0029, + 0x0028, 0x6C34, 0x0029, 0x0028, 0x6728, 0x0029, 0x0028, 0x91D1, 0x0029, 0x0028, 0x571F, 0x0029, + 0x0028, 0x65E5, 0x0029, 0x0028, 0x682A, 0x0029, 0x0028, 0x6709, 0x0029, 0x0028, 0x793E, 0x0029, + 0x0028, 0x540D, 0x0029, 0x0028, 0x7279, 0x0029, 0x0028, 0x8CA1, 0x0029, 0x0028, 0x795D, 0x0029, + 0x0028, 0x52B4, 0x0029, 0x0028, 0x4EE3, 0x0029, 0x0028, 0x547C, 0x0029, 0x0028, 0x5B66, 0x0029, + 0x0028, 0x76E3, 0x0029, 0x0028, 0x4F01, 0x0029, 0x0028, 0x8CC7, 0x0029, 0x0028, 0x5354, 0x0029}; +static unsigned short unicode_unacc_data165[] = { + 0x0028, 0x796D, 0x0029, 0x0028, 0x4F11, 0x0029, 0x0028, 0x81EA, 0x0029, 0x0028, 0x81F3, 0x0029, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0x0050, 0x0054, 0x0045, 0x0032, 0x0031, 0x0032, 0x0032, 0x0032, 0x0033, 0x0032, 0x0034, 0x0032, + 0x0035, 0x0032, 0x0036, 0x0032, 0x0037, 0x0032, 0x0038, 0x0032, 0x0039, 0x0033, 0x0030, 0x0033, + 0x0031, 0x0033, 0x0032, 0x0033, 0x0033, 0x0033, 0x0034, 0x0033, 0x0035}; +static unsigned short unicode_unacc_data166[] = { + 0x1100, 0x1102, 0x1103, 0x1105, 
0x1106, 0x1107, 0x1109, 0x110B, 0x110C, 0x110E, 0x110F, + 0x1110, 0x1111, 0x1112, 0x1100, 0x1161, 0x1102, 0x1161, 0x1103, 0x1161, 0x1105, 0x1161, + 0x1106, 0x1161, 0x1107, 0x1161, 0x1109, 0x1161, 0x110B, 0x1161, 0x110C, 0x1161, 0x110E, + 0x1161, 0x110F, 0x1161, 0x1110, 0x1161, 0x1111, 0x1161, 0x1112, 0x1161, 0x110E, 0x1161, + 0x11B7, 0x1100, 0x1169, 0x110C, 0x116E, 0x110B, 0x1174, 0x110B, 0x116E, 0xFFFF}; +static unsigned short unicode_unacc_data167[] = { + 0x4E00, 0x4E8C, 0x4E09, 0x56DB, 0x4E94, 0x516D, 0x4E03, 0x516B, 0x4E5D, 0x5341, 0x6708, + 0x706B, 0x6C34, 0x6728, 0x91D1, 0x571F, 0x65E5, 0x682A, 0x6709, 0x793E, 0x540D, 0x7279, + 0x8CA1, 0x795D, 0x52B4, 0x79D8, 0x7537, 0x5973, 0x9069, 0x512A, 0x5370, 0x6CE8}; +static unsigned short unicode_unacc_data168[] = { + 0x9805, 0x4F11, 0x5199, 0x6B63, 0x4E0A, 0x4E2D, 0x4E0B, 0x5DE6, 0x53F3, 0x533B, 0x5B97, 0x5B66, + 0x76E3, 0x4F01, 0x8CC7, 0x5354, 0x591C, 0x0033, 0x0036, 0x0033, 0x0037, 0x0033, 0x0038, 0x0033, + 0x0039, 0x0034, 0x0030, 0x0034, 0x0031, 0x0034, 0x0032, 0x0034, 0x0033, 0x0034, 0x0034, 0x0034, + 0x0035, 0x0034, 0x0036, 0x0034, 0x0037, 0x0034, 0x0038, 0x0034, 0x0039, 0x0035, 0x0030}; +static unsigned short unicode_unacc_data169[] = { + 0x0031, 0x6708, 0x0032, 0x6708, 0x0033, 0x6708, 0x0034, 0x6708, 0x0035, 0x6708, 0x0036, + 0x6708, 0x0037, 0x6708, 0x0038, 0x6708, 0x0039, 0x6708, 0x0031, 0x0030, 0x6708, 0x0031, + 0x0031, 0x6708, 0x0031, 0x0032, 0x6708, 0x0048, 0x0067, 0x0065, 0x0072, 0x0067, 0x0065, + 0x0056, 0x004C, 0x0054, 0x0044, 0x30A2, 0x30A4, 0x30A6, 0x30A8, 0x30AA, 0x30AB, 0x30AD, + 0x30AF, 0x30B1, 0x30B3, 0x30B5, 0x30B7, 0x30B9, 0x30BB, 0x30BD, 0x30BF}; +static unsigned short unicode_unacc_data170[] = { + 0x30C1, 0x30C4, 0x30C6, 0x30C8, 0x30CA, 0x30CB, 0x30CC, 0x30CD, 0x30CE, 0x30CF, 0x30D2, + 0x30D5, 0x30D8, 0x30DB, 0x30DE, 0x30DF, 0x30E0, 0x30E1, 0x30E2, 0x30E4, 0x30E6, 0x30E8, + 0x30E9, 0x30EA, 0x30EB, 0x30EC, 0x30ED, 0x30EF, 0x30F0, 0x30F1, 0x30F2, 0xFFFF}; +static unsigned short 
unicode_unacc_data171[] = { + 0x30A2, 0x30FC, 0x30C8, 0x30CF, 0x30A2, 0x30EB, 0x30D5, 0x30A1, 0x30A2, 0x30F3, 0x30A2, 0x30D8, + 0x30A2, 0x30FC, 0x30EB, 0x30A4, 0x30CB, 0x30F3, 0x30AF, 0x30A4, 0x30F3, 0x30C1, 0x30A6, 0x30A9, + 0x30F3, 0x30A8, 0x30B9, 0x30AF, 0x30FC, 0x30C8, 0x30A8, 0x30FC, 0x30AB, 0x30FC, 0x30AA, 0x30F3, + 0x30B9, 0x30AA, 0x30FC, 0x30E0, 0x30AB, 0x30A4, 0x30EA, 0x30AB, 0x30E9, 0x30C3, 0x30C8, 0x30AB, + 0x30ED, 0x30EA, 0x30FC, 0x30ED, 0x30F3, 0x30AB, 0x30F3, 0x30DE, 0x30AB, 0x30AD, 0x30AB, 0x30CB, + 0x30FC, 0x30AD, 0x30AD, 0x30E5, 0x30EA, 0x30FC, 0x30EB, 0x30FC, 0x30AD, 0x30BF, 0x30AD, 0x30ED, + 0x30AD, 0x30ED, 0x30E9, 0x30E0, 0x30AF, 0x30AD, 0x30ED, 0x30E1, 0x30FC, 0x30C8, 0x30EB, 0x30AD, + 0x30ED, 0x30EF, 0x30C3, 0x30C8, 0x30E9, 0x30E0, 0x30AF, 0x30E9, 0x30E0, 0x30C8, 0x30F3, 0x30AF, + 0x30AF, 0x30EB, 0x30A4, 0x30ED, 0x30BB, 0x30AF, 0x30ED, 0x30FC, 0x30CD, 0x30B1, 0x30FC, 0x30B9, + 0x30B3, 0x30EB, 0x30CA, 0x30B3, 0x30FC, 0x30DB, 0x30B5, 0x30A4, 0x30AF, 0x30EB}; +static unsigned short unicode_unacc_data172[] = { + 0x30B5, 0x30F3, 0x30C1, 0x30FC, 0x30E0, 0x30B7, 0x30EA, 0x30F3, 0x30AF, 0x30BB, 0x30F3, 0x30C1, + 0x30BB, 0x30F3, 0x30C8, 0x30FC, 0x30B9, 0x30BF, 0x30B7, 0x30C6, 0x30EB, 0x30C8, 0x30C8, 0x30F3, + 0x30CA, 0x30CE, 0x30CE, 0x30C3, 0x30C8, 0x30CF, 0x30A4, 0x30C4, 0x30FC, 0x30BB, 0x30F3, 0x30C8, + 0x30CF, 0x30FC, 0x30C4, 0x30CF, 0x30FC, 0x30EC, 0x30EB, 0x30CF, 0x30A2, 0x30B9, 0x30C8, 0x30EB, + 0x30D2, 0x30AF, 0x30EB, 0x30D2, 0x30B3, 0x30D2, 0x30EB, 0x30D2, 0x30D5, 0x30A1, 0x30E9, 0x30C3, + 0x30C8, 0x30D5, 0x30A3, 0x30FC, 0x30C8, 0x30C3, 0x30B7, 0x30A7, 0x30EB, 0x30D5, 0x30D5, 0x30E9, + 0x30F3, 0x30D8, 0x30AF, 0x30BF, 0x30FC, 0x30EB, 0x30BD, 0x30D8, 0x30CB, 0x30D2, 0x30D8, 0x30D8, + 0x30EB, 0x30C4, 0x30F3, 0x30B9, 0x30D8, 0x30FC, 0x30D8, 0x30B7, 0x30FC, 0x30BF, 0x30D8, 0x30A4, + 0x30F3, 0x30C8, 0x30DB, 0x30EB, 0x30C8, 0x30DB, 0x30DB, 0x30F3}; +static unsigned short unicode_unacc_data173[] = { + 0x30F3, 0x30DB, 0x30C8, 0x30DB, 0x30FC, 0x30EB, 
0x30DB, 0x30FC, 0x30F3, 0x30DE, 0x30A4, 0x30AF, + 0x30ED, 0x30DE, 0x30A4, 0x30EB, 0x30DE, 0x30C3, 0x30CF, 0x30DE, 0x30EB, 0x30AF, 0x30DE, 0x30F3, + 0x30B7, 0x30E7, 0x30F3, 0x30DF, 0x30AF, 0x30ED, 0x30F3, 0x30DF, 0x30EA, 0x30DF, 0x30EA, 0x30FC, + 0x30EB, 0x30CF, 0x30E1, 0x30AB, 0x30E1, 0x30C8, 0x30F3, 0x30AB, 0x30E1, 0x30FC, 0x30C8, 0x30EB, + 0x30E4, 0x30FC, 0x30C8, 0x30E4, 0x30FC, 0x30EB, 0x30E6, 0x30A2, 0x30F3, 0x30EA, 0x30C3, 0x30C8, + 0x30EB, 0x30EA, 0x30E9, 0x30EB, 0x30FC, 0x30D2, 0x30EB, 0x30FC, 0x30EB, 0x30D5, 0x30EC, 0x30E0, + 0x30EC, 0x30F3, 0x30C8, 0x30F3, 0x30B1, 0x30EF, 0x30C3, 0x30C8, 0x0030, 0x70B9, 0x0031, 0x70B9, + 0x0032, 0x70B9, 0x0033, 0x70B9, 0x0034, 0x70B9, 0x0035, 0x70B9, 0x0036, 0x70B9, 0x0037, 0x70B9}; +static unsigned short unicode_unacc_data174[] = { + 0x0038, 0x70B9, 0x0039, 0x70B9, 0x0031, 0x0030, 0x70B9, 0x0031, 0x0031, 0x70B9, 0x0031, + 0x0032, 0x70B9, 0x0031, 0x0033, 0x70B9, 0x0031, 0x0034, 0x70B9, 0x0031, 0x0035, 0x70B9, + 0x0031, 0x0036, 0x70B9, 0x0031, 0x0037, 0x70B9, 0x0031, 0x0038, 0x70B9, 0x0031, 0x0039, + 0x70B9, 0x0032, 0x0030, 0x70B9, 0x0032, 0x0031, 0x70B9, 0x0032, 0x0032, 0x70B9, 0x0032, + 0x0033, 0x70B9, 0x0032, 0x0034, 0x70B9, 0x0068, 0x0050, 0x0061, 0x0064, 0x0061, 0x0041, + 0x0055, 0x0062, 0x0061, 0x0072, 0x006F, 0x0056, 0x0070, 0x0063, 0x0064, 0x006D, 0x0064, + 0x006D, 0x0032, 0x0064, 0x006D, 0x0033, 0x0049, 0x0055, 0x5E73, 0x6210, 0x662D, 0x548C, + 0x5927, 0x6B63, 0x660E, 0x6CBB, 0x682A, 0x5F0F, 0x4F1A, 0x793E}; +static unsigned short unicode_unacc_data175[] = { + 0x0070, 0x0041, 0x006E, 0x0041, 0x03BC, 0x0041, 0x006D, 0x0041, 0x006B, 0x0041, 0x004B, 0x0042, + 0x004D, 0x0042, 0x0047, 0x0042, 0x0063, 0x0061, 0x006C, 0x006B, 0x0063, 0x0061, 0x006C, 0x0070, + 0x0046, 0x006E, 0x0046, 0x03BC, 0x0046, 0x03BC, 0x0067, 0x006D, 0x0067, 0x006B, 0x0067, 0x0048, + 0x007A, 0x006B, 0x0048, 0x007A, 0x004D, 0x0048, 0x007A, 0x0047, 0x0048, 0x007A, 0x0054, 0x0048, + 0x007A, 0x03BC, 0x006C, 0x006D, 0x006C, 0x0064, 0x006C, 0x006B, 0x006C, 
0x0066, 0x006D, 0x006E, + 0x006D, 0x03BC, 0x006D, 0x006D, 0x006D, 0x0063, 0x006D, 0x006B, 0x006D, 0x006D, 0x006D, 0x0032}; +static unsigned short unicode_unacc_data176[] = { + 0x0063, 0x006D, 0x0032, 0x006D, 0x0032, 0x006B, 0x006D, 0x0032, 0x006D, 0x006D, 0x0033, 0x0063, + 0x006D, 0x0033, 0x006D, 0x0033, 0x006B, 0x006D, 0x0033, 0x006D, 0x2215, 0x0073, 0x006D, 0x2215, + 0x0073, 0x0032, 0x0050, 0x0061, 0x006B, 0x0050, 0x0061, 0x004D, 0x0050, 0x0061, 0x0047, 0x0050, + 0x0061, 0x0072, 0x0061, 0x0064, 0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x0072, 0x0061, 0x0064, + 0x2215, 0x0073, 0x0032, 0x0070, 0x0073, 0x006E, 0x0073, 0x03BC, 0x0073, 0x006D, 0x0073, 0x0070, + 0x0056, 0x006E, 0x0056, 0x03BC, 0x0056, 0x006D, 0x0056, 0x006B, 0x0056, 0x004D, 0x0056, 0x0070, + 0x0057, 0x006E, 0x0057, 0x03BC, 0x0057, 0x006D, 0x0057, 0x006B, 0x0057, 0x004D, 0x0057}; +static unsigned short unicode_unacc_data177[] = { + 0x006B, 0x03A9, 0x004D, 0x03A9, 0x0061, 0x002E, 0x006D, 0x002E, 0x0042, 0x0071, 0x0063, + 0x0063, 0x0063, 0x0064, 0x0043, 0x2215, 0x006B, 0x0067, 0x0043, 0x006F, 0x002E, 0x0064, + 0x0042, 0x0047, 0x0079, 0x0068, 0x0061, 0x0048, 0x0050, 0x0069, 0x006E, 0x004B, 0x004B, + 0x004B, 0x004D, 0x006B, 0x0074, 0x006C, 0x006D, 0x006C, 0x006E, 0x006C, 0x006F, 0x0067, + 0x006C, 0x0078, 0x006D, 0x0062, 0x006D, 0x0069, 0x006C, 0x006D, 0x006F, 0x006C, 0x0050, + 0x0048, 0x0070, 0x002E, 0x006D, 0x002E, 0x0050, 0x0050, 0x004D, 0x0050, 0x0052, 0x0073, + 0x0072, 0x0053, 0x0076, 0x0057, 0x0062, 0x0056, 0x2215, 0x006D, 0x0041, 0x2215, 0x006D}; +static unsigned short unicode_unacc_data178[] = { + 0x0031, 0x65E5, 0x0032, 0x65E5, 0x0033, 0x65E5, 0x0034, 0x65E5, 0x0035, 0x65E5, 0x0036, + 0x65E5, 0x0037, 0x65E5, 0x0038, 0x65E5, 0x0039, 0x65E5, 0x0031, 0x0030, 0x65E5, 0x0031, + 0x0031, 0x65E5, 0x0031, 0x0032, 0x65E5, 0x0031, 0x0033, 0x65E5, 0x0031, 0x0034, 0x65E5, + 0x0031, 0x0035, 0x65E5, 0x0031, 0x0036, 0x65E5, 0x0031, 0x0037, 0x65E5, 0x0031, 0x0038, + 0x65E5, 0x0031, 0x0039, 0x65E5, 0x0032, 0x0030, 
0x65E5, 0x0032, 0x0031, 0x65E5, 0x0032, + 0x0032, 0x65E5, 0x0032, 0x0033, 0x65E5, 0x0032, 0x0034, 0x65E5, 0x0032, 0x0035, 0x65E5, + 0x0032, 0x0036, 0x65E5, 0x0032, 0x0037, 0x65E5, 0x0032, 0x0038, 0x65E5, 0x0032, 0x0039, + 0x65E5, 0x0033, 0x0030, 0x65E5, 0x0033, 0x0031, 0x65E5, 0x0067, 0x0061, 0x006C}; +static unsigned short unicode_unacc_data179[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x042B, 0x044B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data180[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0422, + 0x0442, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data181[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xA72C, 0xA72D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xA738, 0xA739, 0xFFFF, 0xFFFF, 0xFFFF, 0x2184}; +static unsigned short unicode_unacc_data182[] = { + 0x004B, 0x006B, 0x004B, 0x006B, 0x004B, 0x006B, 0xFFFF, 0xFFFF, 0x004C, 0x006C, 0x004F, + 0x006F, 0x004F, 0x006F, 0xFFFF, 0xFFFF, 0x0050, 0x0070, 0x0050, 0x0070, 0x0050, 0x0070, + 0x0051, 0x0071, 0x0051, 0x0071, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0056, 0x0076}; +static unsigned short unicode_unacc_data183[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x00DE, 0x00FE, 0x00DE, 0x00FE, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xA76F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data184[] = { + 0x8C48, 0x66F4, 0x8ECA, 0x8CC8, 0x6ED1, 0x4E32, 0x53E5, 0x9F9C, 0x9F9C, 
0x5951, 0x91D1, + 0x5587, 0x5948, 0x61F6, 0x7669, 0x7F85, 0x863F, 0x87BA, 0x88F8, 0x908F, 0x6A02, 0x6D1B, + 0x70D9, 0x73DE, 0x843D, 0x916A, 0x99F1, 0x4E82, 0x5375, 0x6B04, 0x721B, 0x862D}; +static unsigned short unicode_unacc_data185[] = { + 0x9E1E, 0x5D50, 0x6FEB, 0x85CD, 0x8964, 0x62C9, 0x81D8, 0x881F, 0x5ECA, 0x6717, 0x6D6A, + 0x72FC, 0x90CE, 0x4F86, 0x51B7, 0x52DE, 0x64C4, 0x6AD3, 0x7210, 0x76E7, 0x8001, 0x8606, + 0x865C, 0x8DEF, 0x9732, 0x9B6F, 0x9DFA, 0x788C, 0x797F, 0x7DA0, 0x83C9, 0x9304}; +static unsigned short unicode_unacc_data186[] = { + 0x9E7F, 0x8AD6, 0x58DF, 0x5F04, 0x7C60, 0x807E, 0x7262, 0x78CA, 0x8CC2, 0x96F7, 0x58D8, + 0x5C62, 0x6A13, 0x6DDA, 0x6F0F, 0x7D2F, 0x7E37, 0x964B, 0x52D2, 0x808B, 0x51DC, 0x51CC, + 0x7A1C, 0x7DBE, 0x83F1, 0x9675, 0x8B80, 0x62CF, 0x6A02, 0x8AFE, 0x4E39, 0x5BE7}; +static unsigned short unicode_unacc_data187[] = { + 0x6012, 0x7387, 0x7570, 0x5317, 0x78FB, 0x4FBF, 0x5FA9, 0x4E0D, 0x6CCC, 0x6578, 0x7D22, + 0x53C3, 0x585E, 0x7701, 0x8449, 0x8AAA, 0x6BBA, 0x8FB0, 0x6C88, 0x62FE, 0x82E5, 0x63A0, + 0x7565, 0x4EAE, 0x5169, 0x51C9, 0x6881, 0x7CE7, 0x826F, 0x8AD2, 0x91CF, 0x52F5}; +static unsigned short unicode_unacc_data188[] = { + 0x5442, 0x5973, 0x5EEC, 0x65C5, 0x6FFE, 0x792A, 0x95AD, 0x9A6A, 0x9E97, 0x9ECE, 0x529B, + 0x66C6, 0x6B77, 0x8F62, 0x5E74, 0x6190, 0x6200, 0x649A, 0x6F23, 0x7149, 0x7489, 0x79CA, + 0x7DF4, 0x806F, 0x8F26, 0x84EE, 0x9023, 0x934A, 0x5217, 0x52A3, 0x54BD, 0x70C8}; +static unsigned short unicode_unacc_data189[] = { + 0x88C2, 0x8AAA, 0x5EC9, 0x5FF5, 0x637B, 0x6BAE, 0x7C3E, 0x7375, 0x4EE4, 0x56F9, 0x5BE7, + 0x5DBA, 0x601C, 0x73B2, 0x7469, 0x7F9A, 0x8046, 0x9234, 0x96F6, 0x9748, 0x9818, 0x4F8B, + 0x79AE, 0x91B4, 0x96B8, 0x60E1, 0x4E86, 0x50DA, 0x5BEE, 0x5C3F, 0x6599, 0x6A02}; +static unsigned short unicode_unacc_data190[] = { + 0x71CE, 0x7642, 0x84FC, 0x907C, 0x9F8D, 0x6688, 0x962E, 0x5289, 0x677B, 0x67F3, 0x6D41, + 0x6E9C, 0x7409, 0x7559, 0x786B, 0x7D10, 0x985E, 0x516D, 0x622E, 0x9678, 0x502B, 0x5D19, + 0x6DEA, 
0x8F2A, 0x5F8B, 0x6144, 0x6817, 0x7387, 0x9686, 0x5229, 0x540F, 0x5C65}; +static unsigned short unicode_unacc_data191[] = { + 0x6613, 0x674E, 0x68A8, 0x6CE5, 0x7406, 0x75E2, 0x7F79, 0x88CF, 0x88E1, 0x91CC, 0x96E2, + 0x533F, 0x6EBA, 0x541D, 0x71D0, 0x7498, 0x85FA, 0x96A3, 0x9C57, 0x9E9F, 0x6797, 0x6DCB, + 0x81E8, 0x7ACB, 0x7B20, 0x7C92, 0x72C0, 0x7099, 0x8B58, 0x4EC0, 0x8336, 0x523A}; +static unsigned short unicode_unacc_data192[] = { + 0x5207, 0x5EA6, 0x62D3, 0x7CD6, 0x5B85, 0x6D1E, 0x66B4, 0x8F3B, 0x884C, 0x964D, 0x898B, + 0x5ED3, 0x5140, 0x55C0, 0xFFFF, 0xFFFF, 0x585A, 0xFFFF, 0x6674, 0xFFFF, 0xFFFF, 0x51DE, + 0x732A, 0x76CA, 0x793C, 0x795E, 0x7965, 0x798F, 0x9756, 0x7CBE, 0x7FBD, 0xFFFF}; +static unsigned short unicode_unacc_data193[] = { + 0x8612, 0xFFFF, 0x8AF8, 0xFFFF, 0xFFFF, 0x9038, 0x90FD, 0xFFFF, 0xFFFF, 0xFFFF, 0x98EF, + 0x98FC, 0x9928, 0x9DB4, 0xFFFF, 0xFFFF, 0x4FAE, 0x50E7, 0x514D, 0x52C9, 0x52E4, 0x5351, + 0x559D, 0x5606, 0x5668, 0x5840, 0x58A8, 0x5C64, 0x5C6E, 0x6094, 0x6168, 0x618E}; +static unsigned short unicode_unacc_data194[] = { + 0x61F2, 0x654F, 0x65E2, 0x6691, 0x6885, 0x6D77, 0x6E1A, 0x6F22, 0x716E, 0x722B, 0x7422, + 0x7891, 0x793E, 0x7949, 0x7948, 0x7950, 0x7956, 0x795D, 0x798D, 0x798E, 0x7A40, 0x7A81, + 0x7BC0, 0x7DF4, 0x7E09, 0x7E41, 0x7F72, 0x8005, 0x81ED, 0x8279, 0x8279, 0x8457}; +static unsigned short unicode_unacc_data195[] = { + 0x8910, 0x8996, 0x8B01, 0x8B39, 0x8CD3, 0x8D08, 0x8FB6, 0x9038, 0x96E3, 0x97FF, 0x983B, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x4E26, 0x51B5, 0x5168, 0x4F80, 0x5145, 0x5180, + 0x52C7, 0x52FA, 0x559D, 0x5555, 0x5599, 0x55E2, 0x585A, 0x58B3, 0x5944, 0x5954}; +static unsigned short unicode_unacc_data196[] = { + 0x5A62, 0x5B28, 0x5ED2, 0x5ED9, 0x5F69, 0x5FAD, 0x60D8, 0x614E, 0x6108, 0x618E, 0x6160, + 0x61F2, 0x6234, 0x63C4, 0x641C, 0x6452, 0x6556, 0x6674, 0x6717, 0x671B, 0x6756, 0x6B79, + 0x6BBA, 0x6D41, 0x6EDB, 0x6ECB, 0x6F22, 0x701E, 0x716E, 0x77A7, 0x7235, 0x72AF}; +static unsigned short 
unicode_unacc_data197[] = { + 0x732A, 0x7471, 0x7506, 0x753B, 0x761D, 0x761F, 0x76CA, 0x76DB, 0x76F4, 0x774A, 0x7740, + 0x78CC, 0x7AB1, 0x7BC0, 0x7C7B, 0x7D5B, 0x7DF4, 0x7F3E, 0x8005, 0x8352, 0x83EF, 0x8779, + 0x8941, 0x8986, 0x8996, 0x8ABF, 0x8AF8, 0x8ACB, 0x8B01, 0x8AFE, 0x8AED, 0x8B39}; +static unsigned short unicode_unacc_data198[] = { + 0x8B8A, 0x8D08, 0x8F38, 0x9072, 0x9199, 0x9276, 0x967C, 0x96E3, 0x9756, 0x97DB, 0x97FF, + 0x980B, 0x983B, 0x9B12, 0x9F9C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x4018, 0x4039, 0xFFFF, + 0xFFFF, 0xFFFF, 0x9F43, 0x9F8E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data199[] = { + 0x0066, 0x0066, 0x0066, 0x0069, 0x0066, 0x006C, 0x0066, 0x0066, 0x0069, 0x0066, 0x0066, 0x006C, + 0x0074, 0x0073, 0x0073, 0x0074, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0574, 0x0576, 0x0574, 0x0565, 0x0574, 0x056B, 0x057E, 0x0576, + 0x0574, 0x056D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x05D9, 0xFFFF, 0x05F2}; +static unsigned short unicode_unacc_data200[] = { + 0x05E2, 0x05D0, 0x05D3, 0x05D4, 0x05DB, 0x05DC, 0x05DD, 0x05E8, 0x05EA, 0x002B, 0x05E9, + 0x05E9, 0x05E9, 0x05E9, 0x05D0, 0x05D0, 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, + 0x05D6, 0xFFFF, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0xFFFF, 0x05DE, 0xFFFF}; +static unsigned short unicode_unacc_data201[] = { + 0x05E0, 0x05E1, 0xFFFF, 0x05E3, 0x05E4, 0xFFFF, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, + 0x05D5, 0x05D1, 0x05DB, 0x05E4, 0x05D0, 0x05DC, 0x0671, 0x0671, 0x067B, 0x067B, 0x067B, + 0x067B, 0x067E, 0x067E, 0x067E, 0x067E, 0x0680, 0x0680, 0x0680, 0x0680, 0x067A, 0x067A}; +static unsigned short unicode_unacc_data202[] = { + 0x067A, 0x067A, 0x067F, 0x067F, 0x067F, 0x067F, 0x0679, 0x0679, 0x0679, 0x0679, 0x06A4, + 0x06A4, 0x06A4, 0x06A4, 0x06A6, 0x06A6, 0x06A6, 0x06A6, 0x0684, 0x0684, 0x0684, 0x0684, + 0x0683, 0x0683, 0x0683, 0x0683, 0x0686, 0x0686, 0x0686, 0x0686, 0x0687, 0x0687}; +static unsigned 
short unicode_unacc_data203[] = { + 0x0687, 0x0687, 0x068D, 0x068D, 0x068C, 0x068C, 0x068E, 0x068E, 0x0688, 0x0688, 0x0698, + 0x0698, 0x0691, 0x0691, 0x06A9, 0x06A9, 0x06A9, 0x06A9, 0x06AF, 0x06AF, 0x06AF, 0x06AF, + 0x06B3, 0x06B3, 0x06B3, 0x06B3, 0x06B1, 0x06B1, 0x06B1, 0x06B1, 0x06BA, 0x06BA}; +static unsigned short unicode_unacc_data204[] = { + 0x06BB, 0x06BB, 0x06BB, 0x06BB, 0x06D5, 0x06D5, 0x06C1, 0x06C1, 0x06C1, 0x06C1, 0x06BE, + 0x06BE, 0x06BE, 0x06BE, 0x06D2, 0x06D2, 0x06D2, 0x06D2, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data205[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x06AD, 0x06AD, 0x06AD, + 0x06AD, 0x06C7, 0x06C7, 0x06C6, 0x06C6, 0x06C8, 0x06C8, 0x06C7, 0x0674, 0x06CB, 0x06CB}; +static unsigned short unicode_unacc_data206[] = { + 0x06C5, 0x06C5, 0x06C9, 0x06C9, 0x06D0, 0x06D0, 0x06D0, 0x06D0, 0x0649, 0x0649, + 0x0627, 0x064A, 0x0627, 0x064A, 0x06D5, 0x064A, 0x06D5, 0x064A, 0x0648, 0x064A, + 0x0648, 0x064A, 0x06C7, 0x064A, 0x06C7, 0x064A, 0x06C6, 0x064A, 0x06C6, 0x064A, + 0x06C8, 0x064A, 0x06C8, 0x064A, 0x06D0, 0x064A, 0x06D0, 0x064A, 0x06D0, 0x064A, + 0x0649, 0x064A, 0x0649, 0x064A, 0x0649, 0x064A, 0x06CC, 0x06CC, 0x06CC, 0x06CC}; +static unsigned short unicode_unacc_data207[] = { + 0x062C, 0x064A, 0x062D, 0x064A, 0x0645, 0x064A, 0x0649, 0x064A, 0x064A, 0x064A, 0x0628, + 0x062C, 0x0628, 0x062D, 0x0628, 0x062E, 0x0628, 0x0645, 0x0628, 0x0649, 0x0628, 0x064A, + 0x062A, 0x062C, 0x062A, 0x062D, 0x062A, 0x062E, 0x062A, 0x0645, 0x062A, 0x0649, 0x062A, + 0x064A, 0x062B, 0x062C, 0x062B, 0x0645, 0x062B, 0x0649, 0x062B, 0x064A, 0x062C, 0x062D, + 0x062C, 0x0645, 0x062D, 0x062C, 0x062D, 0x0645, 0x062E, 0x062C, 0x062E, 0x062D, 0x062E, + 0x0645, 0x0633, 0x062C, 0x0633, 0x062D, 0x0633, 0x062E, 0x0633, 0x0645}; +static unsigned short 
unicode_unacc_data208[] = { + 0x0635, 0x062D, 0x0635, 0x0645, 0x0636, 0x062C, 0x0636, 0x062D, 0x0636, 0x062E, 0x0636, + 0x0645, 0x0637, 0x062D, 0x0637, 0x0645, 0x0638, 0x0645, 0x0639, 0x062C, 0x0639, 0x0645, + 0x063A, 0x062C, 0x063A, 0x0645, 0x0641, 0x062C, 0x0641, 0x062D, 0x0641, 0x062E, 0x0641, + 0x0645, 0x0641, 0x0649, 0x0641, 0x064A, 0x0642, 0x062D, 0x0642, 0x0645, 0x0642, 0x0649, + 0x0642, 0x064A, 0x0643, 0x0627, 0x0643, 0x062C, 0x0643, 0x062D, 0x0643, 0x062E, 0x0643, + 0x0644, 0x0643, 0x0645, 0x0643, 0x0649, 0x0643, 0x064A, 0x0644, 0x062C}; +static unsigned short unicode_unacc_data209[] = { + 0x0644, 0x062D, 0x0644, 0x062E, 0x0644, 0x0645, 0x0644, 0x0649, 0x0644, 0x064A, 0x0645, 0x062C, + 0x0645, 0x062D, 0x0645, 0x062E, 0x0645, 0x0645, 0x0645, 0x0649, 0x0645, 0x064A, 0x0646, 0x062C, + 0x0646, 0x062D, 0x0646, 0x062E, 0x0646, 0x0645, 0x0646, 0x0649, 0x0646, 0x064A, 0x0647, 0x062C, + 0x0647, 0x0645, 0x0647, 0x0649, 0x0647, 0x064A, 0x064A, 0x062C, 0x064A, 0x062D, 0x064A, 0x062E, + 0x064A, 0x0645, 0x064A, 0x0649, 0x064A, 0x064A, 0x0630, 0x0631, 0x0649, 0x0020, 0x0020}; +static unsigned short unicode_unacc_data210[] = { + 0x0020, 0x0020, 0x0020, 0x0020, 0x0631, 0x064A, 0x0632, 0x064A, 0x0645, 0x064A, 0x0646, 0x064A, + 0x0649, 0x064A, 0x064A, 0x064A, 0x0628, 0x0631, 0x0628, 0x0632, 0x0628, 0x0645, 0x0628, 0x0646, + 0x0628, 0x0649, 0x0628, 0x064A, 0x062A, 0x0631, 0x062A, 0x0632, 0x062A, 0x0645, 0x062A, 0x0646, + 0x062A, 0x0649, 0x062A, 0x064A, 0x062B, 0x0631, 0x062B, 0x0632, 0x062B, 0x0645, 0x062B, 0x0646, + 0x062B, 0x0649, 0x062B, 0x064A, 0x0641, 0x0649, 0x0641, 0x064A, 0x0642, 0x0649, 0x0642, 0x064A}; +static unsigned short unicode_unacc_data211[] = { + 0x0643, 0x0627, 0x0643, 0x0644, 0x0643, 0x0645, 0x0643, 0x0649, 0x0643, 0x064A, 0x0644, + 0x0645, 0x0644, 0x0649, 0x0644, 0x064A, 0x0645, 0x0627, 0x0645, 0x0645, 0x0646, 0x0631, + 0x0646, 0x0632, 0x0646, 0x0645, 0x0646, 0x0646, 0x0646, 0x0649, 0x0646, 0x064A, 0x0649, + 0x064A, 0x0631, 0x064A, 0x0632, 0x064A, 0x0645, 
0x064A, 0x0646, 0x064A, 0x0649, 0x064A, + 0x064A, 0x062C, 0x064A, 0x062D, 0x064A, 0x062E, 0x064A, 0x0645, 0x064A, 0x0647, 0x064A, + 0x0628, 0x062C, 0x0628, 0x062D, 0x0628, 0x062E, 0x0628, 0x0645}; +static unsigned short unicode_unacc_data212[] = { + 0x0628, 0x0647, 0x062A, 0x062C, 0x062A, 0x062D, 0x062A, 0x062E, 0x062A, 0x0645, 0x062A, + 0x0647, 0x062B, 0x0645, 0x062C, 0x062D, 0x062C, 0x0645, 0x062D, 0x062C, 0x062D, 0x0645, + 0x062E, 0x062C, 0x062E, 0x0645, 0x0633, 0x062C, 0x0633, 0x062D, 0x0633, 0x062E, 0x0633, + 0x0645, 0x0635, 0x062D, 0x0635, 0x062E, 0x0635, 0x0645, 0x0636, 0x062C, 0x0636, 0x062D, + 0x0636, 0x062E, 0x0636, 0x0645, 0x0637, 0x062D, 0x0638, 0x0645, 0x0639, 0x062C, 0x0639, + 0x0645, 0x063A, 0x062C, 0x063A, 0x0645, 0x0641, 0x062C, 0x0641, 0x062D}; +static unsigned short unicode_unacc_data213[] = { + 0x0641, 0x062E, 0x0641, 0x0645, 0x0642, 0x062D, 0x0642, 0x0645, 0x0643, 0x062C, 0x0643, + 0x062D, 0x0643, 0x062E, 0x0643, 0x0644, 0x0643, 0x0645, 0x0644, 0x062C, 0x0644, 0x062D, + 0x0644, 0x062E, 0x0644, 0x0645, 0x0644, 0x0647, 0x0645, 0x062C, 0x0645, 0x062D, 0x0645, + 0x062E, 0x0645, 0x0645, 0x0646, 0x062C, 0x0646, 0x062D, 0x0646, 0x062E, 0x0646, 0x0645, + 0x0646, 0x0647, 0x0647, 0x062C, 0x0647, 0x0645, 0x0647, 0x064A, 0x062C, 0x064A, 0x062D, + 0x064A, 0x062E, 0x064A, 0x0645, 0x064A, 0x0647, 0x0645, 0x064A}; +static unsigned short unicode_unacc_data214[] = { + 0x0647, 0x064A, 0x0628, 0x0645, 0x0628, 0x0647, 0x062A, 0x0645, 0x062A, 0x0647, 0x062B, + 0x0645, 0x062B, 0x0647, 0x0633, 0x0645, 0x0633, 0x0647, 0x0634, 0x0645, 0x0634, 0x0647, + 0x0643, 0x0644, 0x0643, 0x0645, 0x0644, 0x0645, 0x0646, 0x0645, 0x0646, 0x0647, 0x064A, + 0x0645, 0x064A, 0x0647, 0x0640, 0x0640, 0x0640, 0x0637, 0x0649, 0x0637, 0x064A, 0x0639, + 0x0649, 0x0639, 0x064A, 0x063A, 0x0649, 0x063A, 0x064A, 0x0633, 0x0649, 0x0633, 0x064A, + 0x0634, 0x0649, 0x0634, 0x064A, 0x062D, 0x0649}; +static unsigned short unicode_unacc_data215[] = { + 0x062D, 0x064A, 0x062C, 0x0649, 0x062C, 0x064A, 
0x062E, 0x0649, 0x062E, 0x064A, 0x0635, + 0x0649, 0x0635, 0x064A, 0x0636, 0x0649, 0x0636, 0x064A, 0x0634, 0x062C, 0x0634, 0x062D, + 0x0634, 0x062E, 0x0634, 0x0645, 0x0634, 0x0631, 0x0633, 0x0631, 0x0635, 0x0631, 0x0636, + 0x0631, 0x0637, 0x0649, 0x0637, 0x064A, 0x0639, 0x0649, 0x0639, 0x064A, 0x063A, 0x0649, + 0x063A, 0x064A, 0x0633, 0x0649, 0x0633, 0x064A, 0x0634, 0x0649, 0x0634, 0x064A, 0x062D, + 0x0649, 0x062D, 0x064A, 0x062C, 0x0649, 0x062C, 0x064A, 0x062E, 0x0649}; +static unsigned short unicode_unacc_data216[] = { + 0x062E, 0x064A, 0x0635, 0x0649, 0x0635, 0x064A, 0x0636, 0x0649, 0x0636, 0x064A, 0x0634, 0x062C, + 0x0634, 0x062D, 0x0634, 0x062E, 0x0634, 0x0645, 0x0634, 0x0631, 0x0633, 0x0631, 0x0635, 0x0631, + 0x0636, 0x0631, 0x0634, 0x062C, 0x0634, 0x062D, 0x0634, 0x062E, 0x0634, 0x0645, 0x0633, 0x0647, + 0x0634, 0x0647, 0x0637, 0x0645, 0x0633, 0x062C, 0x0633, 0x062D, 0x0633, 0x062E, 0x0634, 0x062C, + 0x0634, 0x062D, 0x0634, 0x062E, 0x0637, 0x0645, 0x0638, 0x0645, 0x0627, 0x0627, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data217[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x062A, 0x062C, 0x0645, 0x062A, 0x062D, 0x062C, + 0x062A, 0x062D, 0x062C, 0x062A, 0x062D, 0x0645, 0x062A, 0x062E, 0x0645, 0x062A, 0x0645, + 0x062C, 0x062A, 0x0645, 0x062D, 0x062A, 0x0645, 0x062E, 0x062C, 0x0645, 0x062D, 0x062C, + 0x0645, 0x062D, 0x062D, 0x0645, 0x064A, 0x062D, 0x0645, 0x0649, 0x0633, 0x062D, 0x062C, + 0x0633, 0x062C, 0x062D, 0x0633, 0x062C, 0x0649, 0x0633, 0x0645, 0x062D}; +static unsigned short unicode_unacc_data218[] = { + 0x0633, 0x0645, 0x062D, 0x0633, 0x0645, 0x062C, 0x0633, 0x0645, 0x0645, 0x0633, 0x0645, 0x0645, + 0x0635, 0x062D, 0x062D, 0x0635, 0x062D, 0x062D, 0x0635, 0x0645, 0x0645, 0x0634, 0x062D, 0x0645, + 0x0634, 0x062D, 0x0645, 0x0634, 0x062C, 0x064A, 0x0634, 0x0645, 0x062E, 0x0634, 0x0645, 0x062E, + 0x0634, 0x0645, 0x0645, 0x0634, 0x0645, 0x0645, 0x0636, 
0x062D, 0x0649, 0x0636, 0x062E, 0x0645, + 0x0636, 0x062E, 0x0645, 0x0637, 0x0645, 0x062D, 0x0637, 0x0645, 0x062D, 0x0637, 0x0645, 0x0645, + 0x0637, 0x0645, 0x064A, 0x0639, 0x062C, 0x0645, 0x0639, 0x0645, 0x0645, 0x0639, 0x0645, 0x0645, + 0x0639, 0x0645, 0x0649, 0x063A, 0x0645, 0x0645, 0x063A, 0x0645, 0x064A, 0x063A, 0x0645, 0x0649, + 0x0641, 0x062E, 0x0645, 0x0641, 0x062E, 0x0645, 0x0642, 0x0645, 0x062D, 0x0642, 0x0645, 0x0645}; +static unsigned short unicode_unacc_data219[] = { + 0x0644, 0x062D, 0x0645, 0x0644, 0x062D, 0x064A, 0x0644, 0x062D, 0x0649, 0x0644, 0x062C, 0x062C, + 0x0644, 0x062C, 0x062C, 0x0644, 0x062E, 0x0645, 0x0644, 0x062E, 0x0645, 0x0644, 0x0645, 0x062D, + 0x0644, 0x0645, 0x062D, 0x0645, 0x062D, 0x062C, 0x0645, 0x062D, 0x0645, 0x0645, 0x062D, 0x064A, + 0x0645, 0x062C, 0x062D, 0x0645, 0x062C, 0x0645, 0x0645, 0x062E, 0x062C, 0x0645, 0x062E, 0x0645, + 0xFFFF, 0xFFFF, 0x0645, 0x062C, 0x062E, 0x0647, 0x0645, 0x062C, 0x0647, 0x0645, 0x0645, 0x0646, + 0x062D, 0x0645, 0x0646, 0x062D, 0x0649, 0x0646, 0x062C, 0x0645, 0x0646, 0x062C, 0x0645, 0x0646, + 0x062C, 0x0649, 0x0646, 0x0645, 0x064A, 0x0646, 0x0645, 0x0649, 0x064A, 0x0645, 0x0645, 0x064A, + 0x0645, 0x0645, 0x0628, 0x062E, 0x064A, 0x062A, 0x062C, 0x064A}; +static unsigned short unicode_unacc_data220[] = { + 0x062A, 0x062C, 0x0649, 0x062A, 0x062E, 0x064A, 0x062A, 0x062E, 0x0649, 0x062A, 0x0645, 0x064A, + 0x062A, 0x0645, 0x0649, 0x062C, 0x0645, 0x064A, 0x062C, 0x062D, 0x0649, 0x062C, 0x0645, 0x0649, + 0x0633, 0x062E, 0x0649, 0x0635, 0x062D, 0x064A, 0x0634, 0x062D, 0x064A, 0x0636, 0x062D, 0x064A, + 0x0644, 0x062C, 0x064A, 0x0644, 0x0645, 0x064A, 0x064A, 0x062D, 0x064A, 0x064A, 0x062C, 0x064A, + 0x064A, 0x0645, 0x064A, 0x0645, 0x0645, 0x064A, 0x0642, 0x0645, 0x064A, 0x0646, 0x062D, 0x064A, + 0x0642, 0x0645, 0x062D, 0x0644, 0x062D, 0x0645, 0x0639, 0x0645, 0x064A, 0x0643, 0x0645, 0x064A, + 0x0646, 0x062C, 0x062D, 0x0645, 0x062E, 0x064A, 0x0644, 0x062C, 0x0645, 0x0643, 0x0645, 0x0645, + 0x0644, 0x062C, 0x0645, 
0x0646, 0x062C, 0x062D, 0x062C, 0x062D, 0x064A, 0x062D, 0x062C, 0x064A}; +static unsigned short unicode_unacc_data221[] = { + 0x0645, 0x062C, 0x064A, 0x0641, 0x0645, 0x064A, 0x0628, 0x062D, 0x064A, 0x0643, 0x0645, 0x0645, + 0x0639, 0x062C, 0x0645, 0x0635, 0x0645, 0x0645, 0x0633, 0x062E, 0x064A, 0x0646, 0x062C, 0x064A, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data222[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0635, 0x0644, 0x06D2, 0x0642, 0x0644, 0x06D2, + 0x0627, 0x0644, 0x0644, 0x0647, 0x0627, 0x0643, 0x0628, 0x0631, 0x0645, 0x062D, 0x0645, + 0x062F, 0x0635, 0x0644, 0x0639, 0x0645, 0x0631, 0x0633, 0x0648, 0x0644, 0x0639, 0x0644, + 0x064A, 0x0647, 0x0648, 0x0633, 0x0644, 0x0645, 0x0635, 0x0644, 0x0649, 0x0635, 0x0644, + 0x0649, 0x0020, 0x0627, 0x0644, 0x0644, 0x0647, 0x0020, 0x0639, 0x0644, 0x064A, 0x0647, + 0x0020, 0x0648, 0x0633, 0x0644, 0x0645, 0x062C, 0x0644, 0x0020, 0x062C, 0x0644, 0x0627, + 0x0644, 0x0647, 0x0631, 0x06CC, 0x0627, 0x0644, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data223[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x002C, 0x3001, 0x3002, 0x003A, 0x003B, 0x0021, 0x003F, 0x3016, + 0x3017, 0x002E, 0x002E, 0x002E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data224[] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x002E, 0x002E, 0x2014, 0x2013, 0x005F, 0x005F, + 0x0028, 0x0029, 0x007B, 0x007D, 0x3014, 0x3015, 0x3010, 0x3011, 0x300A, 0x300B, 0x3008}; +static unsigned short unicode_unacc_data225[] = { + 0x3009, 0x300C, 
0x300D, 0x300E, 0x300F, 0xFFFF, 0xFFFF, 0x005B, 0x005D, 0x0020, 0x0020, + 0x0020, 0x0020, 0x005F, 0x005F, 0x005F, 0x002C, 0x3001, 0x002E, 0xFFFF, 0x003B, 0x003A, + 0x003F, 0x0021, 0x2014, 0x0028, 0x0029, 0x007B, 0x007D, 0x3014, 0x3015, 0x0023}; +static unsigned short unicode_unacc_data226[] = { + 0x0026, 0x002A, 0x002B, 0x002D, 0x003C, 0x003E, 0x003D, 0xFFFF, 0x005C, 0x0024, 0x0025, + 0x0040, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0020, 0x0640, 0x0020, 0xFFFF, 0x0020, 0xFFFF, + 0x0020, 0x0640, 0x0020, 0x0640, 0x0020, 0x0640, 0x0020, 0x0640, 0x0020, 0x0640}; +static unsigned short unicode_unacc_data227[] = { + 0x0621, 0x0627, 0x0627, 0x0627, 0x0627, 0x0648, 0x0648, 0x0627, 0x0627, 0x064A, 0x064A, + 0x064A, 0x064A, 0x0627, 0x0627, 0x0628, 0x0628, 0x0628, 0x0628, 0x0629, 0x0629, 0x062A, + 0x062A, 0x062A, 0x062A, 0x062B, 0x062B, 0x062B, 0x062B, 0x062C, 0x062C, 0x062C}; +static unsigned short unicode_unacc_data228[] = { + 0x062C, 0x062D, 0x062D, 0x062D, 0x062D, 0x062E, 0x062E, 0x062E, 0x062E, 0x062F, 0x062F, + 0x0630, 0x0630, 0x0631, 0x0631, 0x0632, 0x0632, 0x0633, 0x0633, 0x0633, 0x0633, 0x0634, + 0x0634, 0x0634, 0x0634, 0x0635, 0x0635, 0x0635, 0x0635, 0x0636, 0x0636, 0x0636}; +static unsigned short unicode_unacc_data229[] = { + 0x0636, 0x0637, 0x0637, 0x0637, 0x0637, 0x0638, 0x0638, 0x0638, 0x0638, 0x0639, 0x0639, + 0x0639, 0x0639, 0x063A, 0x063A, 0x063A, 0x063A, 0x0641, 0x0641, 0x0641, 0x0641, 0x0642, + 0x0642, 0x0642, 0x0642, 0x0643, 0x0643, 0x0643, 0x0643, 0x0644, 0x0644, 0x0644}; +static unsigned short unicode_unacc_data230[] = { + 0x0644, 0x0645, 0x0645, 0x0645, 0x0645, 0x0646, 0x0646, 0x0646, 0x0646, 0x0647, + 0x0647, 0x0647, 0x0647, 0x0648, 0x0648, 0x0649, 0x0649, 0x064A, 0x064A, 0x064A, + 0x064A, 0x0644, 0x0627, 0x0644, 0x0627, 0x0644, 0x0627, 0x0644, 0x0627, 0x0644, + 0x0627, 0x0644, 0x0627, 0x0644, 0x0627, 0x0644, 0x0627, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data231[] = { + 0xFFFF, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 
0x0028, 0x0029, 0x002A, + 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, + 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F}; +static unsigned short unicode_unacc_data232[] = { + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, + 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F}; +static unsigned short unicode_unacc_data233[] = { + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, + 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, + 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2985}; +static unsigned short unicode_unacc_data234[] = { + 0x2986, 0x3002, 0x300C, 0x300D, 0x3001, 0x30FB, 0x30F2, 0x30A1, 0x30A3, 0x30A5, 0x30A7, + 0x30A9, 0x30E3, 0x30E5, 0x30E7, 0x30C3, 0x30FC, 0x30A2, 0x30A4, 0x30A6, 0x30A8, 0x30AA, + 0x30AB, 0x30AD, 0x30AF, 0x30B1, 0x30B3, 0x30B5, 0x30B7, 0x30B9, 0x30BB, 0x30BD}; +static unsigned short unicode_unacc_data235[] = { + 0x30BF, 0x30C1, 0x30C4, 0x30C6, 0x30C8, 0x30CA, 0x30CB, 0x30CC, 0x30CD, 0x30CE, 0x30CF, + 0x30D2, 0x30D5, 0x30D8, 0x30DB, 0x30DE, 0x30DF, 0x30E0, 0x30E1, 0x30E2, 0x30E4, 0x30E6, + 0x30E8, 0x30E9, 0x30EA, 0x30EB, 0x30EC, 0x30ED, 0x30EF, 0x30F3, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data236[] = { + 0x1160, 0x1100, 0x1101, 0x11AA, 0x1102, 0x11AC, 0x11AD, 0x1103, 0x1104, 0x1105, 0x11B0, + 0x11B1, 0x11B2, 0x11B3, 0x11B4, 0x11B5, 0x111A, 0x1106, 0x1107, 0x1108, 0x1121, 0x1109, + 0x110A, 0x110B, 0x110C, 0x110D, 0x110E, 0x110F, 0x1110, 0x1111, 0x1112, 0xFFFF}; +static unsigned short unicode_unacc_data237[] = { + 0xFFFF, 0xFFFF, 0x1161, 0x1162, 0x1163, 0x1164, 0x1165, 0x1166, 0xFFFF, 0xFFFF, 0x1167, + 0x1168, 0x1169, 0x116A, 0x116B, 0x116C, 0xFFFF, 0xFFFF, 0x116D, 0x116E, 0x116F, 0x1170, + 
0x1171, 0x1172, 0xFFFF, 0xFFFF, 0x1173, 0x1174, 0x1175, 0xFFFF, 0xFFFF, 0xFFFF}; +static unsigned short unicode_unacc_data238[] = { + 0x00A2, 0x00A3, 0x00AC, 0x0020, 0x00A6, 0x00A5, 0x20A9, 0xFFFF, 0x2502, 0x2190, 0x2191, + 0x2192, 0x2193, 0x25A0, 0x25CB, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + +static unsigned short* unicode_unacc_data_table[UNICODE_UNACC_BLOCK_COUNT] = { + unicode_unacc_data0, unicode_unacc_data1, unicode_unacc_data2, unicode_unacc_data3, + unicode_unacc_data4, unicode_unacc_data5, unicode_unacc_data6, unicode_unacc_data7, + unicode_unacc_data8, unicode_unacc_data9, unicode_unacc_data10, unicode_unacc_data11, + unicode_unacc_data12, unicode_unacc_data13, unicode_unacc_data14, unicode_unacc_data15, + unicode_unacc_data16, unicode_unacc_data17, unicode_unacc_data18, unicode_unacc_data19, + unicode_unacc_data20, unicode_unacc_data21, unicode_unacc_data22, unicode_unacc_data23, + unicode_unacc_data24, unicode_unacc_data25, unicode_unacc_data26, unicode_unacc_data27, + unicode_unacc_data28, unicode_unacc_data29, unicode_unacc_data30, unicode_unacc_data31, + unicode_unacc_data32, unicode_unacc_data33, unicode_unacc_data34, unicode_unacc_data35, + unicode_unacc_data36, unicode_unacc_data37, unicode_unacc_data38, unicode_unacc_data39, + unicode_unacc_data40, unicode_unacc_data41, unicode_unacc_data42, unicode_unacc_data43, + unicode_unacc_data44, unicode_unacc_data45, unicode_unacc_data46, unicode_unacc_data47, + unicode_unacc_data48, unicode_unacc_data49, unicode_unacc_data50, unicode_unacc_data51, + unicode_unacc_data52, unicode_unacc_data53, unicode_unacc_data54, unicode_unacc_data55, + unicode_unacc_data56, unicode_unacc_data57, unicode_unacc_data58, unicode_unacc_data59, + unicode_unacc_data60, unicode_unacc_data61, unicode_unacc_data62, unicode_unacc_data63, + unicode_unacc_data64, unicode_unacc_data65, unicode_unacc_data66, unicode_unacc_data67, + 
unicode_unacc_data68, unicode_unacc_data69, unicode_unacc_data70, unicode_unacc_data71, + unicode_unacc_data72, unicode_unacc_data73, unicode_unacc_data74, unicode_unacc_data75, + unicode_unacc_data76, unicode_unacc_data77, unicode_unacc_data78, unicode_unacc_data79, + unicode_unacc_data80, unicode_unacc_data81, unicode_unacc_data82, unicode_unacc_data83, + unicode_unacc_data84, unicode_unacc_data85, unicode_unacc_data86, unicode_unacc_data87, + unicode_unacc_data88, unicode_unacc_data89, unicode_unacc_data90, unicode_unacc_data91, + unicode_unacc_data92, unicode_unacc_data93, unicode_unacc_data94, unicode_unacc_data95, + unicode_unacc_data96, unicode_unacc_data97, unicode_unacc_data98, unicode_unacc_data99, + unicode_unacc_data100, unicode_unacc_data101, unicode_unacc_data102, unicode_unacc_data103, + unicode_unacc_data104, unicode_unacc_data105, unicode_unacc_data106, unicode_unacc_data107, + unicode_unacc_data108, unicode_unacc_data109, unicode_unacc_data110, unicode_unacc_data111, + unicode_unacc_data112, unicode_unacc_data113, unicode_unacc_data114, unicode_unacc_data115, + unicode_unacc_data116, unicode_unacc_data117, unicode_unacc_data118, unicode_unacc_data119, + unicode_unacc_data120, unicode_unacc_data121, unicode_unacc_data122, unicode_unacc_data123, + unicode_unacc_data124, unicode_unacc_data125, unicode_unacc_data126, unicode_unacc_data127, + unicode_unacc_data128, unicode_unacc_data129, unicode_unacc_data130, unicode_unacc_data131, + unicode_unacc_data132, unicode_unacc_data133, unicode_unacc_data134, unicode_unacc_data135, + unicode_unacc_data136, unicode_unacc_data137, unicode_unacc_data138, unicode_unacc_data139, + unicode_unacc_data140, unicode_unacc_data141, unicode_unacc_data142, unicode_unacc_data143, + unicode_unacc_data144, unicode_unacc_data145, unicode_unacc_data146, unicode_unacc_data147, + unicode_unacc_data148, unicode_unacc_data149, unicode_unacc_data150, unicode_unacc_data151, + unicode_unacc_data152, unicode_unacc_data153, 
unicode_unacc_data154, unicode_unacc_data155, + unicode_unacc_data156, unicode_unacc_data157, unicode_unacc_data158, unicode_unacc_data159, + unicode_unacc_data160, unicode_unacc_data161, unicode_unacc_data162, unicode_unacc_data163, + unicode_unacc_data164, unicode_unacc_data165, unicode_unacc_data166, unicode_unacc_data167, + unicode_unacc_data168, unicode_unacc_data169, unicode_unacc_data170, unicode_unacc_data171, + unicode_unacc_data172, unicode_unacc_data173, unicode_unacc_data174, unicode_unacc_data175, + unicode_unacc_data176, unicode_unacc_data177, unicode_unacc_data178, unicode_unacc_data179, + unicode_unacc_data180, unicode_unacc_data181, unicode_unacc_data182, unicode_unacc_data183, + unicode_unacc_data184, unicode_unacc_data185, unicode_unacc_data186, unicode_unacc_data187, + unicode_unacc_data188, unicode_unacc_data189, unicode_unacc_data190, unicode_unacc_data191, + unicode_unacc_data192, unicode_unacc_data193, unicode_unacc_data194, unicode_unacc_data195, + unicode_unacc_data196, unicode_unacc_data197, unicode_unacc_data198, unicode_unacc_data199, + unicode_unacc_data200, unicode_unacc_data201, unicode_unacc_data202, unicode_unacc_data203, + unicode_unacc_data204, unicode_unacc_data205, unicode_unacc_data206, unicode_unacc_data207, + unicode_unacc_data208, unicode_unacc_data209, unicode_unacc_data210, unicode_unacc_data211, + unicode_unacc_data212, unicode_unacc_data213, unicode_unacc_data214, unicode_unacc_data215, + unicode_unacc_data216, unicode_unacc_data217, unicode_unacc_data218, unicode_unacc_data219, + unicode_unacc_data220, unicode_unacc_data221, unicode_unacc_data222, unicode_unacc_data223, + unicode_unacc_data224, unicode_unacc_data225, unicode_unacc_data226, unicode_unacc_data227, + unicode_unacc_data228, unicode_unacc_data229, unicode_unacc_data230, unicode_unacc_data231, + unicode_unacc_data232, unicode_unacc_data233, unicode_unacc_data234, unicode_unacc_data235, + unicode_unacc_data236, unicode_unacc_data237, unicode_unacc_data238}; 
+/* Generated by builder. Do not modify. End unicode_unacc_tables */ + +#define unicode_unacc(c, p, l) \ + { \ + unsigned short index = unicode_unacc_indexes[(c) >> UNICODE_UNACC_BLOCK_SHIFT]; \ + unsigned char position = (c)&UNICODE_UNACC_BLOCK_MASK; \ + (p) = &(unicode_unacc_data_table[index][unicode_unacc_positions[index][position]]); \ + (l) = unicode_unacc_positions[index][position + 1] - \ + unicode_unacc_positions[index][position]; \ + if ((l) == 1 && *(p) == 0xFFFF) { \ + (p) = 0; \ + (l) = 0; \ + } \ + } +SQLITE_EXPORT u16 sqlite3_unicode_unacc(u16 c, u16** p, int* l) { + if (c < 0x80) { + if (l) { + *l = 1; + *p = &c; + } + return c; + } else { + unsigned short index = unicode_unacc_indexes[(c) >> UNICODE_UNACC_BLOCK_SHIFT]; + unsigned char position = (c)&UNICODE_UNACC_BLOCK_MASK; + unsigned short length = + unicode_unacc_positions[index][position + 1] - unicode_unacc_positions[index][position]; + unsigned short* pointer = + &(unicode_unacc_data_table[index][unicode_unacc_positions[index][position]]); + + if (l) { + *l = length; + *p = pointer; + } + return ((length == 1) && (*pointer == 0xFFFF)) ? c : *pointer; + } +} +#endif +/************************************************************************************************* + ************************************************************************************************* + *************************************************************************************************/ + +/* +** Check to see if this machine uses EBCDIC. (Yes, believe it or +** not, there are still machines out there that use EBCDIC.) +*/ +#if 'A' == '\301' +#define SQLITE_EBCDIC 1 +#else +#define SQLITE_ASCII 1 +#endif + +/* +** Assuming zIn points to the first byte of a UTF-8 character, +** advance zIn to point to the first byte of the next UTF-8 character. 
+*/ +#define SQLITE_SKIP_UTF8(zIn) \ + { \ + if ((*(zIn++)) >= 0xc0) { \ + while ((*zIn & 0xc0) == 0x80) { \ + zIn++; \ + } \ + } \ + } + +/* +** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, +** return the number of unicode characters in pZ up to (but not including) +** the first 0x00 byte. If nByte is not less than zero, return the +** number of unicode characters in the first nByte of pZ (or up to +** the first 0x00, whichever comes first). +*/ +SQLITE_PRIVATE int sqlite3Utf8CharLen(const char* zIn, int nByte) { + int r = 0; + const u8* z = (const u8*)zIn; + const u8* zTerm; + if (nByte >= 0) { + zTerm = &z[nByte]; + } else { + zTerm = (const u8*)(-1); + } + assert(z <= zTerm); + while (*z != 0 && z < zTerm) { + SQLITE_SKIP_UTF8(z); + r++; + } + return r; +} + +/* +** This lookup table is used to help decode the first byte of +** a multi-byte UTF8 character. +*/ +static const unsigned char unicode_utf8_lookup[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, +}; + +/* +** Translate a single UTF-8 character. Return the unicode value. +** +** During translation, assume that the byte that zTerm points +** is a 0x00. +** +** Write a pointer to the next unread byte back into *pzNext. +** +** Notes On Invalid UTF-8: +** +** * This routine never allows a 7-bit character (0x00 through 0x7f) to +** be encoded as a multi-byte character. Any multi-byte character that +** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. +** +** * This routine never allows a UTF16 surrogate value to be encoded. 
+** If a multi-byte character attempts to encode a value between +** 0xd800 and 0xe000 then it is rendered as 0xfffd. +** +** * Bytes in the range of 0x80 through 0xbf which occur as the first +** byte of a character are interpreted as single-byte characters +** and rendered as themselves even though they are technically +** invalid characters. +** +** * This routine accepts an infinite number of different UTF8 encodings +** for unicode values 0x80 and greater. It do not change over-length +** encodings to 0xfffd as some systems recommend. +*/ +#define READ_UTF8(zIn, zTerm, c) \ + c = *(zIn++); \ + if (c >= 0xc0) { \ + c = unicode_utf8_lookup[c - 0xc0]; \ + while (zIn != zTerm && (*zIn & 0xc0) == 0x80) { \ + c = (c << 6) + (0x3f & *(zIn++)); \ + } \ + if (c < 0x80 || (c & 0xFFFFF800) == 0xD800 || (c & 0xFFFFFFFE) == 0xFFFE) { \ + c = 0xFFFD; \ + } \ + } +SQLITE_PRIVATE int sqlite3Utf8Read( + const unsigned char* z, /* First byte of UTF-8 character */ + const unsigned char* zTerm, /* Pretend this byte is 0x00 */ + const unsigned char** pzNext /* Write first byte past UTF-8 char here */ +) { + int c; + READ_UTF8(z, zTerm, c); + *pzNext = z; + return c; +} + +/* An array to map all upper-case characters into their corresponding +** lower-case character. +** +** SQLite only considers US-ASCII (or EBCDIC) characters. We do not +** handle case conversions for the UTF character set since the tables +** involved are nearly as big or bigger than SQLite itself. 
+*/ +SQLITE_PRIVATE const unsigned char sqlite3UpperToLower[] = { +#ifdef SQLITE_ASCII + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255 +#endif +#ifdef SQLITE_EBCDIC + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, /* 0x */ + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, 
+ 30, + 31, /* 1x */ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, /* 2x */ + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, /* 3x */ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, /* 4x */ + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, /* 5x */ + 96, + 97, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 106, + 107, + 108, + 109, + 110, + 111, /* 6x */ + 112, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 122, + 123, + 124, + 125, + 126, + 127, /* 7x */ + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, /* 8x */ + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 156, + 159, /* 9x */ + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 140, + 141, + 142, + 175, /* Ax */ + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, /* Bx */ + 192, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 202, + 203, + 204, + 205, + 206, + 207, /* Cx */ + 208, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 218, + 219, + 220, + 221, + 222, + 223, /* Dx */ + 224, + 225, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 232, + 203, + 204, + 205, + 206, + 207, /* Ex */ + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 219, + 220, + 221, + 222, + 255, /* Fx */ +#endif +}; + +/* +** For LIKE and GLOB matching on EBCDIC machines, assume that every +** character is exactly one byte in size. Also, all characters are +** able to participate in upper-case-to-lower-case mappings in EBCDIC +** whereas only characters less than 0x80 do in ASCII. 
*/
/*
**
** The built-in function has been extended to accommodate UTF-8 and UTF-16
** unicode strings containing characters over the 0x80 character limit as
** per the ASCII encoding imposed by SQLite.
**
** The functions below will use sqlite3_unicode_fold() when
** SQLITE3_UNICODE_FOLD is defined and additionally sqlite3_unicode_unacc()
** when SQLITE3_UNICODE_UNACC_AUTOMATIC is defined to normalize
** UTF-8 and UTF-16 encoded strings.
**
** GlogUpperToLower(A) rewrites the lvalue A in place with its normalized
** form.  Which normalization is applied is chosen at compile time below.
*/
#if defined(SQLITE_EBCDIC)
/* EBCDIC: every character is one byte, so "reading a character" is a plain
** byte fetch and every byte goes through the lower-case table. */
#define sqlite3Utf8Read(A, B, C) (*(A++))
#define GlogUpperToLower(A) A = sqlite3UpperToLower[A]
#else
#if defined(SQLITE3_UNICODE_UNACC) && defined(SQLITE3_UNICODE_UNACC_AUTOMATIC) && \
    defined(SQLITE3_UNICODE_FOLD)
/* Strip accents first, then case fold. */
#define GlogUpperToLower(A) A = sqlite3_unicode_fold(sqlite3_unicode_unacc(A, 0, 0))
#elif defined(SQLITE3_UNICODE_FOLD)
/* Case fold only. */
#define GlogUpperToLower(A) A = sqlite3_unicode_fold(A)
#else
/* No unicode tables: lower-case 7-bit characters only.  Note that this
** variant expands to a bare if-statement, not an expression. */
#define GlogUpperToLower(A)         \
    if (A < 0x80) {                 \
        A = sqlite3UpperToLower[A]; \
    }
#endif
#endif

/*
** Maximum length (in bytes) of the pattern in a LIKE or GLOB
** operator.
*/
#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
#define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
#endif

/*
** A structure defining how to do GLOB-style comparisons.
*/
struct compareInfo {
    u8 matchAll; /* Wildcard matching any sequence of characters ('%' for LIKE) */
    u8 matchOne; /* Wildcard matching exactly one character ('_' for LIKE) */
    u8 matchSet; /* Character opening a [...] set; 0 disables sets (LIKE) */
    u8 noCase;   /* Non-zero: normalize both sides with GlogUpperToLower() */
};

/* The correct SQL-92 behavior is for the LIKE operator to ignore
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = {'%', '_', 0, 1};

/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression.  Return true (1) if they
** are the same and false (0) if they are different.
**
** Globbing rules:
**
**      '*'       Matches any sequence of zero or more characters.
**
**      '?'       Matches exactly one character.
**
**     [...]      Matches one character from the enclosed list of
**                characters.
**
**     [^...]     Matches one character not in the enclosed list.
**
** With the [...]
** and [^...] matching, a ']' character can be included
** in the list by making it the first character after '[' or '^'.  A
** range of characters can be specified using '-'.  Example:
** "[a-z]" matches any single lower-case letter.  To match a '-', make
** it the last character in the list.
**
** This routine is usually quick, but can be N**2 in the worst case.
**
** Hints: to match '*' or '?', put them in "[]".  Like this:
**
**         abc[*]xyz        Matches "abc*xyz" only
**
** The wildcard characters actually used come from pInfo (matchAll,
** matchOne, matchSet); the '*', '?' and '[' above are the GLOB spelling.
** Both zPattern and zString are read until their 0x00 terminator.
** The function recurses once per matchAll wildcard in the pattern.
*/
static int patternCompare(
    const u8* zPattern,              /* The glob pattern */
    const u8* zString,               /* The string to compare against the glob */
    const struct compareInfo* pInfo, /* Information about how to do the compare */
    const int esc                    /* The escape character */
) {
    int c, c2;  /* Current pattern character / current string character */
    int invert; /* True while matching a negated [^...] set */
    int seen;   /* True once the string character was found in the set */
    u8 matchOne = pInfo->matchOne;
    u8 matchAll = pInfo->matchAll;
    u8 matchSet = pInfo->matchSet;
    u8 noCase = pInfo->noCase;
    int prevEscape = 0; /* True if the previous character was 'escape' */

    while ((c = sqlite3Utf8Read(zPattern, 0, &zPattern)) != 0) {
        if (!prevEscape && c == matchAll) {
            /* Collapse a run of matchAll/matchOne wildcards; every matchOne
            ** in the run still consumes one character of the string. */
            while ((c = sqlite3Utf8Read(zPattern, 0, &zPattern)) == matchAll || c == matchOne) {
                if (c == matchOne && sqlite3Utf8Read(zString, 0, &zString) == 0) {
                    return 0;
                }
            }
            if (c == 0) {
                /* Pattern ends with matchAll: whatever remains matches. */
                return 1;
            } else if (c == esc) {
                /* The character after the wildcard is escaped: take it literally. */
                c = sqlite3Utf8Read(zPattern, 0, &zPattern);
                if (c == 0) {
                    return 0;
                }
            } else if (c == matchSet) {
                assert(esc == 0);        /* This is GLOB, not LIKE */
                assert(matchSet < 0x80); /* '[' is a single-byte character */
                /* Retry the set (zPattern[-1] is the '[') at each string position. */
                while (*zString && patternCompare(&zPattern[-1], zString, pInfo, esc) == 0) {
                    SQLITE_SKIP_UTF8(zString);
                }
                return *zString != 0;
            }
            /* c is now the literal that must follow the wildcard: scan the
            ** string for each occurrence and try to match the rest from there. */
            while ((c2 = sqlite3Utf8Read(zString, 0, &zString)) != 0) {
                if (noCase) {
                    GlogUpperToLower(c2);
                    GlogUpperToLower(c);
                    while (c2 != 0 && c2 != c) {
                        c2 = sqlite3Utf8Read(zString, 0, &zString);
                        GlogUpperToLower(c2);
                    }
                } else {
                    while (c2 != 0 && c2 != c) {
                        c2 = sqlite3Utf8Read(zString, 0, &zString);
                    }
                }
                if (c2 == 0)
                    return 0;
                if (patternCompare(zPattern, zString, pInfo, esc))
                    return 1;
            }
            return 0;
        } else if (!prevEscape && c == matchOne) {
            /* Single-character wildcard: consume one string character. */
            if (sqlite3Utf8Read(zString, 0, &zString) == 0) {
                return 0;
            }
        } else if (c == matchSet) {
            int prior_c = 0; /* Previous set member; lower bound of an "a-z" range */
            assert(esc == 0); /* This only occurs for GLOB, not LIKE */
            seen = 0;
            invert = 0;
            /* From here on c is the STRING character tested against the set. */
            c = sqlite3Utf8Read(zString, 0, &zString);
            if (c == 0)
                return 0;
            c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
            if (c2 == '^') {
                invert = 1;
                c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
            }
            if (c2 == ']') {
                /* A ']' directly after '[' or '^' is a set member, not the terminator. */
                if (c == ']')
                    seen = 1;
                c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
            }
            while (c2 && c2 != ']') {
                if (c2 == '-' && zPattern[0] != ']' && zPattern[0] != 0 && prior_c > 0) {
                    /* Range: prior_c '-' c2 */
                    c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
                    if (c >= prior_c && c <= c2)
                        seen = 1;
                    prior_c = 0;
                } else {
                    if (c == c2) {
                        seen = 1;
                    }
                    prior_c = c2;
                }
                c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
            }
            /* Unterminated set, or membership disagrees with the negation flag. */
            if (c2 == 0 || (seen ^ invert) == 0) {
                return 0;
            }
        } else if (esc == c && !prevEscape) {
            /* Escape character: the next pattern character is taken literally. */
            prevEscape = 1;
        } else {
            /* Literal character (possibly escaped): must equal the next string character. */
            c2 = sqlite3Utf8Read(zString, 0, &zString);
            if (noCase) {
                GlogUpperToLower(c);
                GlogUpperToLower(c2);
            }
            if (c != c2) {
                return 0;
            }
            prevEscape = 0;
        }
    }
    /* Pattern exhausted: it is a match only if the string is exhausted too. */
    return *zString == 0;
}

/*
** Count the number of times that the LIKE operator (or GLOB which is
** just a variation of LIKE) gets called.  This is used for testing
** only.
*/
#ifdef SQLITE_TEST
SQLITE_API int sqlite3_like_count = 0;
#endif

/*
** Implementation of the like() SQL function.  This function implements
** the built-in LIKE operator.  The first argument to the function is the
** pattern and the second argument is the string.  So, the SQL statements:
**
**       A LIKE B
**
** is implemented as like(B,A).
**
** This same function (with a different compareInfo structure) computes
** the GLOB operator.
+*/ +static void likeFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + const unsigned char *zA, *zB; + int escape = 0; +#if 0 + sqlite3 *db = sqlite3_context_db_handle(context);*/ +#endif + zB = sqlite3_value_text(argv[0]); + zA = sqlite3_value_text(argv[1]); + + /* Limit the length of the LIKE or GLOB pattern to avoid problems + ** of deep recursion and N*N behavior in patternCompare(). + */ +#if 0 + if( sqlite3_value_bytes(argv[0]) > + db->aLimit[SQLITE_LIMIT_LIKE_PATTERN_LENGTH] ){ +#endif +#if 1 + if (sqlite3_value_bytes(argv[0]) > SQLITE_MAX_LIKE_PATTERN_LENGTH) { +#endif + sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); + return; + } + + assert(zB == sqlite3_value_text(argv[0])); /* Encoding did not change */ + + if (argc == 3) { + /* The escape character string must consist of a single UTF-8 character. + ** Otherwise, return an error. + */ + const unsigned char* zEsc = sqlite3_value_text(argv[2]); + if (zEsc == 0) + return; + if (sqlite3Utf8CharLen((char*)zEsc, -1) != 1) { + sqlite3_result_error(context, "ESCAPE expression must be a single character", -1); + return; + } + escape = sqlite3Utf8Read(zEsc, 0, &zEsc); + } + if (zA && zB) { + struct compareInfo* pInfo = sqlite3_user_data(context); +#ifdef SQLITE_TEST + sqlite3_like_count++; +#endif + + sqlite3_result_int(context, patternCompare(zB, zA, pInfo, escape)); + } +} + +/* +** Allocate nByte bytes of space using sqlite3_malloc(). If the +** allocation fails, call sqlite3_result_error_nomem() to notify +** the database handle that malloc() has failed. 
+*/ +static void* contextMalloc(sqlite3_context* context, i64 nByte) { + char* z; +#if 0 + if( nByte>sqlite3_context_db_handle(context)->aLimit[SQLITE_LIMIT_LENGTH] ){ + sqlite3_result_error_toobig(context); + z = 0; + }else{ +#endif + z = sqlite3_malloc((int)nByte); + if (!z && nByte > 0) { + sqlite3_result_error_nomem(context); + } +#if 0 + } +#endif + return z; +} + +/* +** +** Reallocate nByte bytes of space using sqlite3_realloc(). If the +** allocation fails, call sqlite3_result_error_nomem() to notify +** the database handle that malloc() has failed. +** +** SQlite has not supplied us with a reallocate function so we build our own. +*/ +SQLITE_PRIVATE void* contextRealloc(sqlite3_context* context, void* pPrior, i64 nByte) { + char* z = sqlite3_realloc(pPrior, (int)nByte); + if (!z && nByte > 0) { + sqlite3_result_error_nomem(context); + } + return z; +} + +#if (defined(SQLITE3_UNICODE_FOLD) || defined(SQLITE3_UNICODE_LOWER) || \ + defined(SQLITE3_UNICODE_UPPER) || defined(SQLITE3_UNICODE_TITLE)) +/* +** +** Implementation of the FOLD(), UPPER(), LOWER(), TITLE() SQL functions. +** This function case folds each character in the supplied string to its +** single character equivalent. +** +** The conversion to be made depends on the contents of (sqlite3_context *)context +** where a pointer to a specific case conversion function is stored. 
+*/ +SQLITE_PRIVATE void caseFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + u16* z1; + const u16* z2; + int i, n; + if (argc < 1 || SQLITE_NULL == sqlite3_value_type(argv[0])) + return; + z2 = (u16*)sqlite3_value_text16(argv[0]); + n = sqlite3_value_bytes16(argv[0]); + /* Verify that the call to _bytes() does not invalidate the _text() pointer */ + assert(z2 == (u16*)sqlite3_value_text16(argv[0])); + if (z2) { + z1 = contextMalloc(context, n + 2); + if (z1) { + typedef u16 (*PFN_CASEFUNC)(u16); + memcpy(z1, z2, n + 2); + for (i = 0; z1[i]; i++) { + z1[i] = ((PFN_CASEFUNC)sqlite3_user_data(context))(z1[i]); + } + sqlite3_result_text16(context, z1, -1, sqlite3_free); + } + } +} +#endif + +#ifdef SQLITE3_UNICODE_UNACC +/* +** +** Implementation of the UNACCENT() SQL function. +** This function decomposes each character in the supplied string +** to its components and strips any accents present in the string. +** +** This function may result to a longer output string compared +** to the original input string. Memory has been properly reallocated +** to accomodate for the extra memory length required. 
+*/ +SQLITE_PRIVATE void unaccFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + u16* z1; + const u16* z2; + unsigned short* p; + int i, o, n, l, k; + if (argc < 1 || SQLITE_NULL == sqlite3_value_type(argv[0])) + return; + z2 = (u16*)sqlite3_value_text16(argv[0]); + n = sqlite3_value_bytes16(argv[0]); + /* Verify that the call to _bytes() does not invalidate the _text() pointer */ + assert(z2 == (u16*)sqlite3_value_text16(argv[0])); + if (z2) { + z1 = contextMalloc(context, n + 2); + if (z1) { + memcpy(z1, z2, n + 2); + for (i = 0, o = 0; z2[i]; i++, o++) { + unicode_unacc(z2[i], p, l); + if (l > 0) { + if (l > 1) { + n += (l - 1) * sizeof(u16); + z1 = contextRealloc(context, z1, n + 2); + } + for (k = 0; k < l; k++) + z1[o + k] = p[k]; + o += --k; + } else + z1[o] = z2[i]; + } + z1[o] = 0; + sqlite3_result_text16(context, z1, -1, sqlite3_free); + } + } +} +#endif + +#if defined(SQLITE3_UNICODE_COLLATE) && defined(SQLITE3_UNICODE_FOLD) + +#ifndef max +#define max(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +/* +** Some systems have stricmp(). Others have strcasecmp(). Because +** there is no consistency, we will define our own. +*/ +/* +** +** The buit-in function has been extended to accomodate UTF-8 and UTF-16 +** unicode strings containing characters over the 0x80 character limit as +** per the ASCII encoding imposed by SQlite. +** +** The functions below will use the sqlite3_unicode_fold() when +** SQLITE3_UNICODE_FOLD is defined and additonally sqlite_unicode_unacc() +** when SQLITE3_UNICODE_UNACC_AUTOMATIC is defined to normilize +** UTF-8 and UTF-16 encoded strings and then compaire them for equality. 
+*/ +SQLITE_PRIVATE int sqlite3StrNICmp(const unsigned char* zLeft, const unsigned char* zRight, int N) { + const unsigned char* a = zLeft; + const unsigned char* b = zRight; + signed int ua = 0, ub = 0; + int Z = 0; + + do { + ua = sqlite3Utf8Read(a, 0, &a); + ub = sqlite3Utf8Read(b, 0, &b); + GlogUpperToLower(ua); + GlogUpperToLower(ub); + Z = (int)max(a - zLeft, b - zRight); + } while (N > Z && *a != 0 && ua == ub); + return N < 0 ? 0 : ua - ub; +} +SQLITE_PRIVATE int sqlite3StrNICmp16(const void* zLeft, const void* zRight, int N) { + const unsigned short* a = zLeft; + const unsigned short* b = zRight; + signed int ua = 0, ub = 0; + + do { + ua = *a; + ub = *b; + GlogUpperToLower(ua); + GlogUpperToLower(ub); + a++; + b++; + } while (--N > 0 && *a != 0 && ua == ub); + return N < 0 ? 0 : ua - ub; +} + +/* +** Another built-in collating sequence: NOCASE. +** +** This collating sequence is intended to be used for "case independant +** comparison". SQLite's knowledge of upper and lower case equivalents +** extends only to the 26 characters used in the English language. +** +** At the moment there is only a UTF-8 implementation. +*/ +/* +** +** The built-in collating sequence: NOCASE is extended to accomodate the +** unicode case folding mapping tables to normalize characters to their +** fold equivalents and test them for equality. +** +** Both UTF-8 and UTF-16 implementations are supported. +** +** (void *)encoding takes the following values +** * SQLITE_UTF8 for UTF-8 encoded string comparison +** * SQLITE_UFT16 for UTF-16 encoded string comparison +*/ +SQLITE_EXPORT int sqlite3_unicode_collate(void* encoding, + int nKey1, + const void* pKey1, + int nKey2, + const void* pKey2) { + (void)sqlite3UpperToLower; + int r = 0; + + if ((void*)SQLITE_UTF8 == encoding) + r = sqlite3StrNICmp((const unsigned char*)pKey1, (const unsigned char*)pKey2, + (nKey1 < nKey2) ? 
nKey1 : nKey2); + else if ((void*)SQLITE_UTF16 == encoding) + r = sqlite3StrNICmp16((const void*)pKey1, (const void*)pKey2, + (nKey1 < nKey2) ? nKey1 : nKey2); + + if (0 == r) { + r = nKey1 - nKey2; + } + return r; +} +#endif + +/* +** +** Implementation of the UNICODE_VERSION(*) function. The result is the version +** of the unicode library that is running. +*/ +SQLITE_PRIVATE void versionFunc(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLITE3_UNICODE_VERSION_STRING, -1, SQLITE_STATIC); +} + +/* +** +** Register the UNICODE extension functions with database db. +*/ +SQLITE_EXPORT int sqlite3_unicode_init_impl(sqlite3* db) { + static const int flags16 = SQLITE_UTF16 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + static const int flagsAny = SQLITE_ANY | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + + struct FuncScalar { + const char* zName; /* Function name */ + int nArg; /* Number of arguments */ + int enc; /* Optimal text encoding */ + void* pContext; /* sqlite3_user_data() context */ + void (*xFunc)(sqlite3_context*, int, sqlite3_value**); + } scalars[] = { + {"unicode_version", 0, flagsAny, 0, versionFunc}, + +#ifdef SQLITE3_UNICODE_FOLD + {"like", 2, flags16, (void*)&likeInfoNorm, likeFunc}, + {"nlike", 2, flagsAny, (void*)&likeInfoNorm, likeFunc}, + {"like", 3, flags16, (void*)&likeInfoNorm, likeFunc}, + {"nlike", 3, flagsAny, (void*)&likeInfoNorm, likeFunc}, + + {"casefold", 1, flagsAny, (void*)sqlite3_unicode_fold, caseFunc}, +#endif +#ifdef SQLITE3_UNICODE_LOWER + {"lower", 1, flags16, (void*)sqlite3_unicode_lower, caseFunc}, + {"nlower", 1, flagsAny, (void*)sqlite3_unicode_lower, caseFunc}, +#endif +#ifdef SQLITE3_UNICODE_UPPER + {"upper", 1, flags16, (void*)sqlite3_unicode_upper, caseFunc}, + {"nupper", 1, flagsAny, (void*)sqlite3_unicode_upper, caseFunc}, +#endif +#ifdef SQLITE3_UNICODE_TITLE + {"title", 1, flagsAny, (void*)sqlite3_unicode_title, caseFunc}, + {"ntitle", 1, flagsAny, (void*)sqlite3_unicode_title, 
caseFunc}, +#endif +#ifdef SQLITE3_UNICODE_UNACC + {"unaccent", 1, flagsAny, 0, unaccFunc}, +#endif + }; + + for (size_t i = 0; i < (sizeof(scalars) / sizeof(struct FuncScalar)); i++) { + struct FuncScalar* p = &scalars[i]; + sqlite3_create_function(db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0); + } + +#if defined(SQLITE3_UNICODE_COLLATE) && defined(SQLITE3_UNICODE_FOLD) + /* Also override the default NOCASE UTF-8 case-insensitive collation sequence. */ + sqlite3_create_collation(db, "NOCASE", SQLITE_UTF8, (void*)SQLITE_UTF8, + sqlite3_unicode_collate); + sqlite3_create_collation(db, "NOCASE", SQLITE_UTF16, (void*)SQLITE_UTF16, + sqlite3_unicode_collate); +#endif + + return SQLITE_OK; +} + +int unicode_init(sqlite3* db) { + sqlite3_unicode_init_impl(db); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/unicode/extension.h b/libsql-ffi/bundled/sqlean/unicode/extension.h new file mode 100644 index 0000000000..b68fc79e57 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/unicode/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Unicode support for SQLite. 
+ +#ifndef UNICODE_EXTENSION_H +#define UNICODE_EXTENSION_H + +#include "sqlite3ext.h" + +int unicode_init(sqlite3* db); + +#endif /* UNICODE_EXTENSION_H */ diff --git a/libsql-ffi/bundled/sqlean/uuid/extension.c b/libsql-ffi/bundled/sqlean/uuid/extension.c new file mode 100644 index 0000000000..a809cdd7c2 --- /dev/null +++ b/libsql-ffi/bundled/sqlean/uuid/extension.c @@ -0,0 +1,276 @@ +// Originally from the uuid SQLite exension, Public Domain +// https://www.sqlite.org/src/file/ext/misc/uuid.c + +// Modified by Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean/ + +// Universally Unique IDentifiers (UUIDs) in SQLite + +/* + * This SQLite extension implements functions that handling RFC-4122 for UUIDv4 + * and RFC-9562 for UUIDv7 + * + * Five SQL functions are implemented: + * + * uuid4() - generate a version 4 UUID as a string + * uuid7() - generate a version 7 UUID as a string + * uuid_str(X) - convert a UUID X into a well-formed UUID string + * uuid_blob(X) - convert a UUID X into a 16-byte blob + * uuid7_timestamp_ms(X) - extract unix timestamp in miliseconds + * from version 7 UUID X. + * + * The output from uuid4(), uuid7() and uuid_str(X) are always well-formed RFC-4122 + * UUID strings in this format: + * + * xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx + * + * All of the 'x', 'M', and 'N' values are lower-case hexadecimal digits. + * The M digit indicates the "version". For uuid4()-generated UUIDs, the + * version is always "4" (a random UUID). The upper three bits of N digit + * are the "variant". This library only supports variant 1 (indicated + * by values of N between '8' and 'b') as those are overwhelming the most + * common. Other variants are for legacy compatibility only. + * + * The output of uuid_blob(X) is always a 16-byte blob. The UUID input + * string is converted in network byte order (big-endian) in accordance + * with RFC-4122 specifications for variant-1 UUIDs. 
Note that network + * byte order is *always* used, even if the input self-identifies as a + * variant-2 UUID. + * + * The input X to the uuid_str() and uuid_blob() functions can be either + * a string or a BLOB. If it is a BLOB it must be exactly 16 bytes in + * length or else a NULL is returned. If the input is a string it must + * consist of 32 hexadecimal digits, upper or lower case, optionally + * surrounded by {...} and with optional "-" characters interposed in the + * middle. The flexibility of input is inspired by the PostgreSQL + * implementation of UUID functions that accept in all of the following + * formats: + * + * A0EEBC99-9C0B-4EF8-BB6D-6BB9BD380A11 + * {a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11} + * a0eebc999c0b4ef8bb6d6bb9bd380a11 + * a0ee-bc99-9c0b-4ef8-bb6d-6bb9-bd38-0a11 + * {a0eebc99-9c0b4ef8-bb6d6bb9-bd380a11} + * + * If any of the above inputs are passed into uuid_str(), the output will + * always be in the canonical RFC-4122 format: + * + * a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11 + * + * If the X input string has too few or too many digits or contains + * stray characters other than {, }, or -, then NULL is returned. + */ +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#if !defined(SQLITE_ASCII) && !defined(SQLITE_EBCDIC) +#define SQLITE_ASCII 1 +#endif + +/* + * Translate a single byte of Hex into an integer. + * This routine only works if h really is a valid hexadecimal + * character: 0..9a..fA..F + */ +static unsigned char sqlite3UuidHexToInt(int h) { + assert((h >= '0' && h <= '9') || (h >= 'a' && h <= 'f') || (h >= 'A' && h <= 'F')); +#ifdef SQLITE_ASCII + h += 9 * (1 & (h >> 6)); +#endif +#ifdef SQLITE_EBCDIC + h += 9 * (1 & ~(h >> 4)); +#endif + return (unsigned char)(h & 0xf); +} + +/* + * Convert a 16-byte BLOB into a well-formed RFC-4122 UUID. The output + * buffer zStr should be at least 37 bytes in length. The output will + * be zero-terminated. 
+ */
+static void sqlite3_uuid_blob_to_str(const unsigned char* aBlob, /* Input blob */
+                                     unsigned char* zStr         /* Write the answer here */
+) {
+    static const char zDigits[] = "0123456789abcdef";
+    int i, k;
+    unsigned char x;
+    /* k is a bit mask shifted right once per byte: 0x550 has bits 4, 6, 8
+    ** and 10 set, so a '-' is emitted before bytes 4, 6, 8 and 10. */
+    for (i = 0, k = 0x550; i < 16; i++, k = k >> 1) {
+        if (k & 1) {
+            zStr[0] = '-';
+            zStr++;
+        }
+        x = aBlob[i];
+        zStr[0] = zDigits[x >> 4];
+        zStr[1] = zDigits[x & 0xf];
+        zStr += 2;
+    }
+    *zStr = 0;
+}
+
+/*
+ * Attempt to parse a zero-terminated input string zStr into a binary
+ * UUID. Return 0 on success, or non-zero if the input string is not
+ * parsable.
+ *
+ * One optional leading '{', one optional '-' before each byte, and one
+ * optional trailing '}' are skipped; anything left after the 16th byte
+ * makes the parse fail.
+ */
+static int sqlite3_uuid_str_to_blob(const unsigned char* zStr, /* Input string */
+                                    unsigned char* aBlob       /* Write results here */
+) {
+    int i;
+    if (zStr[0] == '{')
+        zStr++;
+    for (i = 0; i < 16; i++) {
+        if (zStr[0] == '-')
+            zStr++;
+        if (isxdigit(zStr[0]) && isxdigit(zStr[1])) {
+            aBlob[i] = (sqlite3UuidHexToInt(zStr[0]) << 4) + sqlite3UuidHexToInt(zStr[1]);
+            zStr += 2;
+        } else {
+            return 1;
+        }
+    }
+    if (zStr[0] == '}')
+        zStr++;
+    return zStr[0] != 0;
+}
+
+/*
+ * Render sqlite3_value pIn as a 16-byte UUID blob. Return a pointer
+ * to the blob, or NULL if the input is not well-formed.
+ *
+ * For TEXT input the parsed bytes are written to pBuf and pBuf is
+ * returned; for a 16-byte BLOB the value's own storage is returned and
+ * pBuf is not used.
+ */
+static const unsigned char* sqlite3_uuid_input_to_blob(sqlite3_value* pIn, /* Input text */
+                                                       unsigned char* pBuf /* output buffer */
+) {
+    switch (sqlite3_value_type(pIn)) {
+        case SQLITE_TEXT: {
+            const unsigned char* z = sqlite3_value_text(pIn);
+            /* sqlite3_value_text() may return NULL (e.g. on OOM); treat
+            ** that as malformed input instead of dereferencing it. */
+            if (z == 0 || sqlite3_uuid_str_to_blob(z, pBuf))
+                return 0;
+            return pBuf;
+        }
+        case SQLITE_BLOB: {
+            int n = sqlite3_value_bytes(pIn);
+            return n == 16 ? sqlite3_value_blob(pIn) : 0;
+        }
+        default: {
+            return 0;
+        }
+    }
+}
+
+/*
+ * uuid_v4_generate generates a version 4 UUID as a string
+ */
+static void uuid_v4_generate(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    unsigned char aBlob[16];
+    unsigned char zStr[37];
+    (void)argc;
+    (void)argv;
+    sqlite3_randomness(16, aBlob);
+    aBlob[6] = (aBlob[6] & 0x0f) + 0x40; /* version nibble = 4 */
+    aBlob[8] = (aBlob[8] & 0x3f) + 0x80; /* top two bits = 10 */
+    sqlite3_uuid_blob_to_str(aBlob, zStr);
+    sqlite3_result_text(context, (char*)zStr, 36, SQLITE_TRANSIENT);
+}
+
+// Time functions are not available on some older systems like 32-bit Windows.
+#ifndef SQLEAN_OMIT_UUID7
+/*
+ * uuid_v7_generate generates a version 7 UUID as a string
+ *
+ * Bytes 0..5 hold the current UTC time in milliseconds, big-endian; the
+ * remaining bytes are random apart from the version and variant bits.
+ * Reports an SQL error if the system clock cannot be read.
+ */
+static void uuid_v7_generate(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    unsigned char aBlob[16];
+    unsigned char zStr[37];
+    (void)argc;
+    (void)argv;
+
+    struct timespec ts;
+    /* timespec_get() returns its base argument on success and 0 on
+    ** failure; on failure ts is not filled in and must not be read. */
+    if (timespec_get(&ts, TIME_UTC) != TIME_UTC) {
+        sqlite3_result_error(context, "uuid7: cannot read the system clock", -1);
+        return;
+    }
+    unsigned long long timestampMs = ts.tv_sec * 1000ULL + ts.tv_nsec / 1000000;
+
+    sqlite3_randomness(16, aBlob);
+    aBlob[0] = timestampMs >> 40;
+    aBlob[1] = timestampMs >> 32;
+    aBlob[2] = timestampMs >> 24;
+    aBlob[3] = timestampMs >> 16;
+    aBlob[4] = timestampMs >> 8;
+    aBlob[5] = timestampMs;
+    aBlob[6] = (aBlob[6] & 0x0f) + 0x70; /* version nibble = 7 */
+    aBlob[8] = (aBlob[8] & 0x3f) + 0x80; /* top two bits = 10 */
+    sqlite3_uuid_blob_to_str(aBlob, zStr);
+    sqlite3_result_text(context, (char*)zStr, 36, SQLITE_TRANSIENT);
+}
+
+/*
+ * uuid_v7_extract_timestamp_ms extracts the unix timestamp in milliseconds
+ * from a version 7 UUID.
+ * X can be either a string or a blob.
+ * If X is not a version 7 UUID, return NULL.
+ */
+static void uuid_v7_extract_timestamp_ms(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    unsigned char scratch[16]; /* receives parsed bytes when X is text */
+    unsigned long long millis = 0;
+    size_t idx = 0;
+    const unsigned char* uuid = sqlite3_uuid_input_to_blob(argv[0], scratch);
+    (void)argc;
+
+    /* No result is set (SQL NULL) for malformed input or for any UUID
+    ** whose version nibble is not 7. */
+    if (uuid == 0) {
+        return;
+    }
+    if ((uuid[6] >> 4) != 7) {
+        return;
+    }
+
+    /* The first six bytes are the big-endian millisecond timestamp. */
+    while (idx < 6) {
+        millis = (millis << 8) + uuid[idx];
+        idx++;
+    }
+
+    sqlite3_result_int64(context, millis);
+}
+
+#endif
+
+/*
+ * uuid_str converts a UUID X into a well-formed UUID string.
+ * X can be either a string or a blob.
+ * No result is set (SQL NULL) when X cannot be read as a UUID.
+ */
+static void uuid_str(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    unsigned char scratch[16]; /* receives parsed bytes when X is text */
+    unsigned char text[37];    /* 36 characters plus the terminator */
+    const unsigned char* uuid = sqlite3_uuid_input_to_blob(argv[0], scratch);
+    (void)argc;
+
+    if (uuid != 0) {
+        sqlite3_uuid_blob_to_str(uuid, text);
+        sqlite3_result_text(context, (char*)text, 36, SQLITE_TRANSIENT);
+    }
+}
+
+/*
+ * uuid_blob converts a UUID X into a 16-byte blob.
+ * X can be either a string or a blob.
+ * No result is set (SQL NULL) when X cannot be read as a UUID.
+ */
+static void uuid_blob(sqlite3_context* context, int argc, sqlite3_value** argv) {
+    unsigned char aBlob[16];
+    const unsigned char* pBlob;
+    (void)argc;
+    pBlob = sqlite3_uuid_input_to_blob(argv[0], aBlob);
+    if (pBlob == 0)
+        return;
+    sqlite3_result_blob(context, pBlob, 16, SQLITE_TRANSIENT);
+}
+
+/*
+ * Register the UUID SQL functions on connection db.
+ *
+ * Every function is attempted even if an earlier registration fails.
+ * Returns SQLITE_OK when all registrations succeed, otherwise the first
+ * error code reported by sqlite3_create_function().
+ */
+int uuid_init(sqlite3* db) {
+    static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS;
+    static const int det_flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC;
+    /* Functions in registration order. The generators return a new value
+    ** on every call, so only the converters are marked deterministic. */
+    static const struct {
+        const char* zName;  /* SQL function name */
+        int nArg;           /* number of SQL arguments */
+        int bDeterministic; /* non-zero to register with det_flags */
+        void (*xFunc)(sqlite3_context*, int, sqlite3_value**);
+    } aFunc[] = {
+        {"uuid4", 0, 0, uuid_v4_generate},
+        {"gen_random_uuid", 0, 0, uuid_v4_generate},
+#ifndef SQLEAN_OMIT_UUID7
+        {"uuid7", 0, 0, uuid_v7_generate},
+        {"uuid7_timestamp_ms", 1, 1, uuid_v7_extract_timestamp_ms},
+#endif
+        {"uuid_str", 1, 1, uuid_str},
+        {"uuid_blob", 1, 1, uuid_blob},
+    };
+    int rc = SQLITE_OK;
+    size_t i;
+
+    for (i = 0; i < sizeof(aFunc) / sizeof(aFunc[0]); i++) {
+        int r = sqlite3_create_function(db, aFunc[i].zName, aFunc[i].nArg,
+                                        aFunc[i].bDeterministic ? det_flags : flags, 0,
+                                        aFunc[i].xFunc, 0, 0);
+        if (rc == SQLITE_OK) {
+            rc = r; /* remember the first failure, keep registering the rest */
+        }
+    }
+    return rc;
+}
diff --git a/libsql-ffi/bundled/sqlean/uuid/extension.h b/libsql-ffi/bundled/sqlean/uuid/extension.h
new file mode 100644
index 0000000000..c3ca15ad34
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/uuid/extension.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2023 Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean
+
+// Universally Unique IDentifiers (UUIDs) in SQLite
+
+#ifndef UUID_EXTENSION_H
+#define UUID_EXTENSION_H
+
+#include "sqlite3ext.h"
+
+int uuid_init(sqlite3* db);
+
+#endif /* UUID_EXTENSION_H */
diff --git a/libsql-ffi/bundled/sqlean/vsv/extension.c b/libsql-ffi/bundled/sqlean/vsv/extension.c
new file mode 100644
index 0000000000..9b85ace9c8
--- /dev/null
+++ b/libsql-ffi/bundled/sqlean/vsv/extension.c
@@ -0,0 +1,1547 @@
+// vsv extension by Keith Medcalf, Public Domain
+// https://github.com/ncruces/kmedcalf-sqlite/blob/main/vsv.c
+
+// Modified by Anton Zhiyanov, MIT License
+// https://github.com/nalgeon/sqlean/
+
+// 
CSV files as virtual tables in SQLite + +/* +** 2020-02-08 modified by Keith Medcalf who also disclaims all copyright +** on the modifications and hereby places this code in the public domain +** +** This file contains the implementation of an SQLite virtual table for +** reading VSV (Variably Separated Values), which are like CSV files, +** but subtly different. VSV supports a number of extensions to the +** CSV format as well as more processing options. +** +** +** Usage: +** +** create virtual table temp.vsv using vsv(...); +** select * from vsv; +** +** The parameters to the vsv module (the vsv(...) part) are as follows: +** +** filename=STRING the filename, passed to the Operating System +** data=STRING alternative data +** schema=STRING Alternate Schema to use +** columns=N columns parsed from the VSV file +** header=BOOL whether or not a header row is present +** skip=N number of leading data rows to skip +** rsep=STRING record separator +** fsep=STRING field separator +** dsep=STRING decimal separator +** validatetext=BOOL validate UTF-8 encoding of text fields +** affinity=AFFINITY affinity to apply to each returned value +** nulls=BOOL empty fields are returned as NULL +** +** +** Defaults: +** +** filename / data nothing. You must provide one or the other +** it is an error to provide both or neither +** schema nothing. If not provided then one will be +** generated for you from the header, or if no +** header is available then autogenerated using +** field names manufactured as cX where X is the +** column number +** columns nothing. 
If not specified then the number of +** columns is determined by counting the fields +** in the first record of the VSV file (which +** will be the header row if header is specified), +** the number of columns is not parsed from the +** schema even if one is provided +** header=no no header row in the VSV file +** skip=0 do not skip any data rows in the VSV file +** fsep=',' default field separator is a comma +** rsep='\n' default record separator is a newline +** dsep='.' default decimal separator is a point +** validatetext=no do not validate text field encoding +** affinity=none do not apply affinity to each returned value +** nulls=off empty fields returned as zero-length +** +** +** Parameter types: +** +** STRING means a quoted string +** N means a whole number not containing a sign +** BOOL means something that evaluates as true or false +** it is case insensitive +** yes, no, true, false, 1, 0 +** AFFINITY means an SQLite3 type specification +** it is case insensitive +** none, blob, text, integer, real, numeric +** +** STRING means a quoted string. The quote character may be either +** a single quote or a double quote. Two quote characters in a row +** will be replaced with a single quote character. STRINGS do not +** need to be quoted if it is obvious where they begin and end +** (that is, they do not contain a comma). Leading and trailing +** spaces will be trimmed from unquoted strings. +** +** filename =./this/filename.here, ... +** filename =./this/filename.here , ... +** filename = ./this/filename.here, ... +** filename = ./this/filename.here , ... +** filename = './this/filename.here', ... +** filename = "./this/filename.here", ... +** +** are all equivalent. +** +** BOOL defaults to true so the following specifications are all the +** same: +** +** header = true +** header = yes +** header = 1 +** header +** +** +** Specific Parameters: +** +** The platform/compiler/OS fopen call is responsible for interpreting +** the filename. 
It may contain anything recognized by the OS. +** +** The separator string containing exactly one character, or a valid +** escape sequence. Recognized escape sequences are: +** +** \t horizontal tab, ascii character 9 (0x09) +** \n linefeed, ascii character 10 (0x0a) +** \v vertical tab, ascii character 11 (0x0b) +** \f form feed, ascii character 12 (0x0c) +** \xhh specific byte where hh is hexadecimal +** +** The validatetext setting will cause the validity of the field +** encoding (not its contents) to be verified. It effects how +** fields that are supposed to contain text will be returned to +** the SQLite3 library in order to prevent invalid utf8 data from +** being stored or processed as if it were valid utf8 text. +** +** The nulls option will cause fields that do not contain anything +** to return NULL rather than an empty result. Two separators +** side-by-each with no intervening characters at all will be +** returned as NULL if nulls is true and if nulls is false or +** the contents are explicity empty ("") then a 0 length blob +** (if affinity=blob) or 0 length text string. +** +** For the affinity setting, the following processing is applied to +** each value returned by the VSV virtual table: +** +** none no affinity is applied, all fields will be +** returned as text just like in the original +** csv module, embedded nulls will terminate +** the text. if validatetext is in effect then +** an error will be thrown if the field does +** not contain validly encoded text or contains +** embedded nulls +** +** blob all fields will be returned as blobs +** validatetext has no effect +** +** text all fields will be returned as text just +** like in the original csv module, embedded +** nulls will terminate the text. 
+** if validatetext is in effect then a blob +** will be returned if the field does not +** contain validly encoded text or the field +** contains embedded nulls +** +** integer if the field data looks like an integer, +** (regex "^ *(\+|-)?\d+ *$"), +** then an integer will be returned as +** provided by the compiler and platform +** runtime strtoll function +** otherwise the field will be processed as +** text as defined above +** +** real if the field data looks like a number, +** (regex "^ *(\+|-)?(\d+\.?\d*|\d*\.?\d+)([eE](\+|-)?\d+)? *$") +** then a double will be returned as +** provided by the compiler and platform +** runtime strtold function otherwise the +** field will be processed as text as +** defined above +** +** numeric if the field looks like an integer +** (see integer above) that integer will be +** returned +** if the field looks like a number +** (see real above) then the number will +** returned as an integer if it has no +** fractional part and +** (a) your platform/compiler supports +** long double and the number will fit in +** a 64-bit integer; or, +** (b) your platform/compiler does not +** support long double (treats it as a double) +** then a 64-bit integer will only be returned +** if the value would fit in a 6-byte varint, +** otherwise a double will be returned +** +** The nulls option will cause fields that do not contain anything +** to return NULL rather than an empty result. Two separators +** side-by-each with no intervening characters at all will be +** returned as NULL if nulls is true and if nulls is false or +** the contents are explicity empty ("") then a 0 length blob +** (if affinity=blob) or 0 length text string will be returned. +** +*/ +/* +** 2016-05-28 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. 
+** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This file contains the implementation of an SQLite virtual table for +** reading CSV files. +** +** Usage: +** +** .load ./csv +** CREATE VIRTUAL TABLE temp.csv USING csv(filename=FILENAME); +** SELECT * FROM csv; +** +** The columns are named "c1", "c2", "c3", ... by default. Or the +** application can define its own CREATE TABLE statement using the +** schema= parameter, like this: +** +** CREATE VIRTUAL TABLE temp.csv2 USING csv( +** filename = "../http.log", +** schema = "CREATE TABLE x(date,ipaddr,url,referrer,userAgent)" +** ); +** +** Instead of specifying a file, the text of the CSV can be loaded using +** the data= parameter. +** +** If the columns=N parameter is supplied, then the CSV file is assumed to have +** N columns. If both the columns= and schema= parameters are omitted, then +** the number and names of the columns is determined by the first line of +** the CSV input. +** +*/ +#include +#include +#include +#include +#include +#include +#include + +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +/**A macro to hint to the compiler that a function should not be * *inlined.*/ +#if defined(__GNUC__) +#define VSV_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) && _MSC_VER >= 1310 +#define VSV_NOINLINE __declspec(noinline) +#else +#define VSV_NOINLINE +#endif + +/* +** Max size of the error message in a VsvReader +*/ +#define VSV_MXERR 200 + +/* +** Size of the VsvReader input buffer +*/ +#define VSV_INBUFSZ 1024 + +/* +** A context object used when read a VSV file. 
+*/
+typedef struct VsvReader VsvReader;
+struct VsvReader {
+    FILE* in;             /* Read the VSV text from this input stream */
+    char* z;              /* Accumulated text for a field */
+    int n;                /* Number of bytes in z */
+    int nAlloc;           /* Space allocated for z[] */
+    int nLine;            /* Current line number */
+    int bNotFirst;        /* True if prior text has been seen */
+    int cTerm;            /* Character that terminated the most recent field */
+    int fsep;             /* Field Separator Character */
+    int rsep;             /* Record Separator Character */
+    int dsep;             /* Decimal Separator Character */
+    int affinity;         /* Perform Affinity Conversions */
+    int notNull;          /* Have we seen data for field */
+    size_t iIn;           /* Next unread character in the input buffer */
+    size_t nIn;           /* Number of characters in the input buffer */
+    char* zIn;            /* The input buffer */
+    char zErr[VSV_MXERR]; /* Error message */
+};
+
+/*
+** Initialize a VsvReader object
+**
+** Clears the stream, the field accumulator, the input-buffer state and
+** the error text.  The separator, affinity and cTerm members are left
+** untouched, so they survive a vsv_reader_reset().
+*/
+static void vsv_reader_init(VsvReader* p) {
+    p->in = 0;
+    p->z = 0;
+    p->n = 0;
+    p->nAlloc = 0;
+    p->nLine = 0;
+    p->bNotFirst = 0;
+    /* Rewind the read position together with nIn: a reader that is reset
+    ** and then reopened on in-memory data must start at offset 0 rather
+    ** than wherever the previous pass stopped. */
+    p->iIn = 0;
+    p->nIn = 0;
+    p->zIn = 0;
+    p->notNull = 0;
+    p->zErr[0] = 0;
+}
+
+/*
+** Close and reset a VsvReader object
+**
+** The input buffer is freed only when a file is open; when reading from
+** in-memory data zIn points at the caller's string and is not freed here.
+*/
+static void vsv_reader_reset(VsvReader* p) {
+    if (p->in) {
+        fclose(p->in);
+        sqlite3_free(p->zIn);
+    }
+    sqlite3_free(p->z);
+    vsv_reader_init(p);
+}
+
+/*
+** Report an error on a VsvReader
+**
+** Formats the message into p->zErr, truncated to VSV_MXERR bytes.
+*/
+static void vsv_errmsg(VsvReader* p, const char* zFormat, ...) {
+    va_list ap;
+    va_start(ap, zFormat);
+    sqlite3_vsnprintf(VSV_MXERR, p->zErr, zFormat, ap);
+    va_end(ap);
+}
+
+/*
+** Open the file associated with a VsvReader
+** Return the number of errors.
+**
+** When zFilename is non-NULL a VSV_INBUFSZ read buffer is allocated and
+** the file is opened in binary mode; otherwise zData is used directly as
+** the input buffer.
+*/
+static int vsv_reader_open(VsvReader* p,          /* The reader to open */
+                           const char* zFilename, /* Read from this filename */
+                           const char* zData      /*  ... or use this data */
+) {
+    if (zFilename) {
+        p->zIn = sqlite3_malloc(VSV_INBUFSZ);
+        if (p->zIn == 0) {
+            vsv_errmsg(p, "out of memory");
+            return 1;
+        }
+        p->in = fopen(zFilename, "rb");
+        if (p->in == 0) {
+            /* reset() only frees zIn when a file is open, so free it here;
+            ** the message is written after reset() because reset() clears
+            ** zErr. */
+            sqlite3_free(p->zIn);
+            vsv_reader_reset(p);
+            vsv_errmsg(p, "cannot open '%s' for reading", zFilename);
+            return 1;
+        }
+    } else {
+        assert(p->in == 0);
+        p->zIn = (char*)zData;
+        p->nIn = strlen(zData);
+    }
+    return 0;
+}
+
+/*
+** The input buffer has overflowed.  Refill the input buffer, then
+** return the next character
+**
+** Returns EOF when fread() delivers nothing.  The byte is returned as an
+** unsigned char value, matching vsv_getc(): read through plain char, a
+** byte >= 0x80 could come back negative where char is signed, and 0xFF
+** would compare equal to EOF where EOF is -1.
+*/
+static VSV_NOINLINE int vsv_getc_refill(VsvReader* p) {
+    size_t got;
+
+    assert(p->iIn >= p->nIn); /* Only called on an empty input buffer */
+    assert(p->in != 0);       /* Only called if reading from a file */
+
+    got = fread(p->zIn, 1, VSV_INBUFSZ, p->in);
+    if (got == 0) {
+        return EOF;
+    }
+    p->nIn = got;
+    p->iIn = 1;
+    return ((unsigned char*)p->zIn)[0];
+}
+
+/*
+** Return the next character of input.  Return EOF at end of input.
+*/
+static int vsv_getc(VsvReader* p) {
+    if (p->iIn >= p->nIn) {
+        if (p->in != 0) {
+            return vsv_getc_refill(p);
+        }
+        return EOF;
+    }
+    return ((unsigned char*)p->zIn)[p->iIn++];
+}
+
+/*
+** Increase the size of p->z and append character c to the end.
+** Return 0 on success and non-zero if there is an OOM error
+*/
+static VSV_NOINLINE int vsv_resize_and_append(VsvReader* p, char c) {
+    char* zNew;
+    int nNew = p->nAlloc * 2 + 100;
+    zNew = sqlite3_realloc64(p->z, nNew);
+    if (zNew) {
+        p->z = zNew;
+        p->nAlloc = nNew;
+        p->z[p->n++] = c;
+        return 0;
+    } else {
+        vsv_errmsg(p, "out of memory");
+        return 1;
+    }
+}
+
+/*
+** Append a single character to the VsvReader.z[] array.
+** Return 0 on success and non-zero if there is an OOM error
+**
+** Grows the buffer while one byte is still spare, so a terminator can
+** always be stored at z[n] afterwards.
+*/
+static int vsv_append(VsvReader* p, char c) {
+    if (p->n >= p->nAlloc - 1) {
+        return vsv_resize_and_append(p, c);
+    }
+    p->z[p->n++] = c;
+    return 0;
+}
+
+/*
+** Read a single field of VSV text.
Compatible with rfc4180 and extended
+** with the option of having a separator other than ",".
+**
+**   +  Input comes from p->in.
+**   +  Store results in p->z of length p->n.  Space to hold p->z comes
+**      from sqlite3_malloc64().
+**   +  Keep track of the line number in p->nLine.
+**   +  Store the character that terminates the field in p->cTerm.  Store
+**      EOF on end-of-file.
+**
+** Return 0 at EOF or on OOM.  On EOF, the p->cTerm character will have
+** been set to EOF.
+*/
+static char* vsv_read_one_field(VsvReader* p) {
+    int c;
+    p->notNull = 0;
+    p->n = 0;
+    c = vsv_getc(p);
+    if (c == EOF) {
+        p->cTerm = EOF;
+        return 0;
+    }
+    if (c == '"') {
+        /* Quoted field: pc and ppc hold the previous two characters. */
+        int pc, ppc;
+        int startLine = p->nLine;
+        p->notNull = 1;
+        pc = ppc = 0;
+        while (1) {
+            c = vsv_getc(p);
+            if (c == '\n') {
+                p->nLine++;
+            }
+            /* A doubled quote is an escaped quote: keep the one already
+            ** appended and forget that a quote was just seen. */
+            if (c == '"' && pc == '"') {
+                pc = ppc;
+                ppc = 0;
+                continue;
+            }
+            /* A quote followed by a separator (or CRLF, or EOF) closes
+            ** the field; drop everything back to that closing quote. */
+            if ((c == p->fsep && pc == '"') || (c == p->rsep && pc == '"') ||
+                (p->rsep == '\n' && c == '\n' && pc == '\r' && ppc == '"') ||
+                (c == EOF && pc == '"')) {
+                do {
+                    p->n--;
+                } while (p->z[p->n] != '"');
+                p->cTerm = (char)c;
+                break;
+            }
+            if (pc == '"' && p->rsep == '\n' && c != '\r') {
+                vsv_errmsg(p, "line %d: unescaped %c character", p->nLine, '"');
+                break;
+            }
+            if (c == EOF) {
+                vsv_errmsg(p, "line %d: unterminated %c-quoted field\n", startLine, '"');
+                p->cTerm = (char)c;
+                break;
+            }
+            if (vsv_append(p, (char)c)) {
+                return 0;
+            }
+            ppc = pc;
+            pc = c;
+        }
+    } else {
+        /*
+        ** If this is the first field being parsed and it begins with the
+        ** UTF-8 BOM  (0xEF BB BF) then skip the BOM
+        */
+        if ((c & 0xff) == 0xef && p->bNotFirst == 0) {
+            vsv_append(p, (char)c);
+            c = vsv_getc(p);
+            if ((c & 0xff) == 0xbb) {
+                vsv_append(p, (char)c);
+                c = vsv_getc(p);
+                if ((c & 0xff) == 0xbf) {
+                    p->bNotFirst = 1;
+                    p->n = 0;
+                    return vsv_read_one_field(p);
+                }
+            }
+        }
+        while (c != EOF && c != p->rsep && c != p->fsep) {
+            if (c == '\n')
+                p->nLine++;
+            if (!p->notNull)
+                p->notNull = 1;
+            if (vsv_append(p, (char)c))
+                return 0;
+            c = vsv_getc(p);
+        }
+        if (c == '\n') {
+            p->nLine++;
+        }
+        /* Drop the CR of a CRLF pair when newline is a separator. */
+        if (p->n > 0 && (p->rsep == '\n' || p->fsep == '\n') && p->z[p->n - 1] == '\r') {
+            p->n--;
+            if (p->n == 0) {
+                p->notNull = 0;
+            }
+        }
+        p->cTerm = (char)c;
+    }
+    if (p->z) {
+        p->z[p->n] = 0;
+    }
+    p->bNotFirst = 1;
+    return p->z;
+}
+
+/*
+** Forward references to the various virtual table methods implemented
+** in this file.
+*/
+static int vsvtabCreate(sqlite3*, void*, int, const char* const*, sqlite3_vtab**, char**);
+static int vsvtabConnect(sqlite3*, void*, int, const char* const*, sqlite3_vtab**, char**);
+static int vsvtabBestIndex(sqlite3_vtab*, sqlite3_index_info*);
+static int vsvtabDisconnect(sqlite3_vtab*);
+static int vsvtabOpen(sqlite3_vtab*, sqlite3_vtab_cursor**);
+static int vsvtabClose(sqlite3_vtab_cursor*);
+static int vsvtabFilter(sqlite3_vtab_cursor*,
+                        int idxNum,
+                        const char* idxStr,
+                        int argc,
+                        sqlite3_value** argv);
+static int vsvtabNext(sqlite3_vtab_cursor*);
+static int vsvtabEof(sqlite3_vtab_cursor*);
+static int vsvtabColumn(sqlite3_vtab_cursor*, sqlite3_context*, int);
+static int vsvtabRowid(sqlite3_vtab_cursor*, sqlite3_int64*);
+
+/*
+** An instance of the VSV virtual table
+*/
+typedef struct VsvTable {
+    sqlite3_vtab base; /* Base class.  Must be first */
+    char* zFilename;   /* Name of the VSV file */
+    char* zData;       /* Raw VSV data in lieu of zFilename */
+    long iStart;       /* Offset to start of data in zFilename */
+    int nCol;          /* Number of columns in the VSV file */
+    int fsep;          /* The field separator for this VSV file */
+    int rsep;          /* The record separator for this VSV file */
+    int dsep;          /* The decimal separator for this VSV file */
+    int affinity;      /* Perform affinity conversions */
+    int nulls;         /* Process NULLs */
+    int validateUTF8;  /* Validate UTF8 */
+} VsvTable;
+
+/*
+** A cursor for the VSV virtual table
+*/
+typedef struct VsvCursor {
+    sqlite3_vtab_cursor base; /* Base class.  Must be first */
+    VsvReader rdr;            /* The VsvReader object */
+    char** azVal;             /* Value of the current row */
+    int* aLen;                /* Allocation Length of each entry */
+    int* dLen;                /* Data Length of each entry */
+    sqlite3_int64 iRowid;     /* The current rowid.  Negative for EOF */
+} VsvCursor;
+
+/*
+** Transfer error message text from a reader into a VsvTable
+**
+** Any previous table error message is freed first.
+*/
+static void vsv_xfer_error(VsvTable* pTab, VsvReader* pRdr) {
+    sqlite3_free(pTab->base.zErrMsg);
+    pTab->base.zErrMsg = sqlite3_mprintf("%s", pRdr->zErr);
+}
+
+/*
+** This method is the destructor for a VsvTable object.
+*/
+static int vsvtabDisconnect(sqlite3_vtab* pVtab) {
+    VsvTable* p = (VsvTable*)pVtab;
+    sqlite3_free(p->zFilename);
+    sqlite3_free(p->zData);
+    sqlite3_free(p);
+    return SQLITE_OK;
+}
+
+/*
+** Skip leading whitespace.  Return a pointer to the first non-whitespace
+** character, or to the zero terminator if the string has only whitespace
+*/
+static const char* vsv_skip_whitespace(const char* z) {
+    while (isspace((unsigned char)z[0])) {
+        z++;
+    }
+    return z;
+}
+
+/*
+** Remove trailing whitespace from the end of string z[]
+**
+** The string is shortened in place by storing a terminator after the
+** last non-whitespace character.
+*/
+static void vsv_trim_whitespace(char* z) {
+    size_t n = strlen(z);
+    /* Test the last character, z[n-1].  z[n] is the terminator, which is
+    ** never whitespace, so testing it would leave the string untouched. */
+    while (n > 0 && isspace((unsigned char)z[n - 1])) {
+        n--;
+    }
+    z[n] = 0;
+}
+
+/*
+** Dequote the string
+**
+** If z starts and ends with the same quote character (' or "), the outer
+** quotes are removed in place and each doubled quote inside is collapsed
+** to a single one.  Any other string is left unchanged.
+*/
+static void vsv_dequote(char* z) {
+    int j;
+    char cQuote = z[0];
+    size_t i, n;
+
+    if (cQuote != '\'' && cQuote != '"') {
+        return;
+    }
+    n = strlen(z);
+    if (n < 2 || z[n - 1] != z[0]) {
+        return;
+    }
+    for (i = 1, j = 0; i < n - 1; i++) {
+        if (z[i] == cQuote && z[i + 1] == cQuote) {
+            i++;
+        }
+        z[j++] = z[i];
+    }
+    z[j] = 0;
+}
+
+/*
+** Check to see if the string is of the form:  "TAG = VALUE" with optional
+** whitespace before and around tokens.  If it is, return a pointer to the
+** first character of VALUE.  If it is not, return NULL.
+*/
+static const char* vsv_parameter(const char* zTag, int nTag, const char* z) {
+    const char* zCur = vsv_skip_whitespace(z);
+    if (strncmp(zTag, zCur, nTag) == 0) {
+        zCur = vsv_skip_whitespace(zCur + nTag);
+        if (zCur[0] == '=') {
+            return vsv_skip_whitespace(zCur + 1);
+        }
+    }
+    return 0;
+}
+
+/*
+** Decode a parameter that requires a dequoted string.
+**
+** Return 1 if the parameter is seen, or 0 if not.  1 is returned
+** even if there is an error.  If an error occurs, then an error message
+** is left in p->zErr.  If there are no errors, p->zErr[0]==0.
+**
+** On success *pzVal receives a copy of the value made with
+** sqlite3_mprintf(), trimmed and dequoted.  A non-NULL *pzVal on entry
+** means the parameter was already given and is reported as an error.
+*/
+static int vsv_string_parameter(VsvReader* p, /* Leave the error message here, if there is one */
+                                const char* zParam, /* Parameter we are checking for */
+                                const char* zArg,   /* Raw text of the virtual table argment */
+                                char** pzVal        /* Write the dequoted string value here */
+) {
+    const char* zRaw = vsv_parameter(zParam, (int)strlen(zParam), zArg);
+    if (!zRaw) {
+        return 0;
+    }
+    p->zErr[0] = 0;
+    if (*pzVal != 0) {
+        vsv_errmsg(p, "more than one '%s' parameter", zParam);
+    } else if ((*pzVal = sqlite3_mprintf("%s", zRaw)) == 0) {
+        vsv_errmsg(p, "out of memory");
+    } else {
+        vsv_trim_whitespace(*pzVal);
+        vsv_dequote(*pzVal);
+    }
+    return 1;
+}
+
+/*
+** Return 0 if the argument is false and 1 if it is true.  Return -1 if
+** we cannot really tell.
+**
+** The words are compared case-insensitively against the whole string;
+** "1" and "0" must be the entire string.
+*/
+static int vsv_boolean(const char* z) {
+    static const char* const azTrue[] = {"yes", "on", "true"};
+    static const char* const azFalse[] = {"no", "off", "false"};
+    size_t k;
+
+    for (k = 0; k < sizeof(azTrue) / sizeof(azTrue[0]); k++) {
+        if (sqlite3_stricmp(azTrue[k], z) == 0) {
+            return 1;
+        }
+    }
+    if (z[0] == '1' && z[1] == 0) {
+        return 1;
+    }
+    for (k = 0; k < sizeof(azFalse) / sizeof(azFalse[0]); k++) {
+        if (sqlite3_stricmp(azFalse[k], z) == 0) {
+            return 0;
+        }
+    }
+    if (z[0] == '0' && z[1] == 0) {
+        return 0;
+    }
+    return -1;
+}
+
+/*
+** Check to see if the string is of the form:  "TAG = BOOLEAN" or just "TAG".
+** If it is, set *pValue to be the value of the boolean ("true" if there is
+** not "= BOOLEAN" component) and return non-zero.  If the input string
+** does not begin with TAG, return zero.
+**
+** Zero is also returned when TAG is followed by something other than
+** '=' or end of string, or when the text after '=' is not recognised by
+** vsv_boolean().
+*/
+static int vsv_boolean_parameter(const char* zTag, /* Tag we are looking for */
+                                 int nTag,         /* Size of the tag in bytes */
+                                 const char* z,    /* Input parameter */
+                                 int* pValue       /* Write boolean value here */
+) {
+    int b;
+    z = vsv_skip_whitespace(z);
+    if (strncmp(zTag, z, nTag) != 0) {
+        return 0;
+    }
+    z = vsv_skip_whitespace(z + nTag);
+    if (z[0] == 0) {
+        *pValue = 1;
+        return 1;
+    }
+    if (z[0] != '=') {
+        return 0;
+    }
+    z = vsv_skip_whitespace(z + 1);
+    b = vsv_boolean(z);
+    if (b >= 0) {
+        *pValue = b;
+        return 1;
+    }
+    return 0;
+}
+
+/*
+** Convert the separator character specification into the character code
+** Return 1 signifies error, 0 for no error
+**
+** Recognized inputs:
+**      any single character
+**      escaped characters \f \n \t \v
+**      escaped hex byte   \x1e \x1f etc (RS and US respectively)
+**
+** A NULL or empty specification selects dflt.  Any other input,
+** including strings of an unsupported length, is an error and leaves
+** *out unmodified.
+*/
+static int vsv_parse_sep_char(char* in, int dflt, int* out) {
+    if (!in) {
+        *out = dflt;
+        return 0;
+    }
+    switch (strlen(in)) {
+        case 0: {
+            *out = dflt;
+            return 0;
+        }
+        case 1: {
+            *out = in[0];
+            return 0;
+        }
+        case 2: {
+            if (in[0] != '\\') {
+                return 1;
+            }
+            switch (in[1]) {
+                case 'f': {
+                    *out = 12;
+                    return 0;
+                }
+                case 'n': {
+                    *out = 10;
+                    return 0;
+                }
+                case 't': {
+                    *out = 9;
+                    return 0;
+                }
+                case 'v': {
+                    *out = 11;
+                    return 0;
+                }
+            }
+            return 1;
+        }
+        case 4: {
+            if (sqlite3_strnicmp(in, "\\x", 2) != 0) {
+                return 1;
+            }
+            /* <ctype.h> classifiers take an unsigned char value or EOF;
+            ** a plain char may be negative. */
+            if (!isxdigit((unsigned char)in[2]) || !isxdigit((unsigned char)in[3])) {
+                return 1;
+            }
+            /* Letters have a low nibble of 1..6, so +9 gives 10..15. */
+            *out = ((in[2] > '9' ? (in[2] & 0x0f) + 9 : in[2] & 0x0f) << 4) +
+                   (in[3] > '9' ? (in[3] & 0x0f) + 9 : in[3] & 0x0f);
+            return 0;
+        }
+        default: {
+            /* Lengths 3 and >4 match none of the recognized forms.  Report
+            ** the error instead of returning success with *out unset. */
+            return 1;
+        }
+    }
+}
+
+/*
+** Parameters:
+**    filename=FILENAME          Name of file containing VSV content
+**    data=TEXT                  Direct VSV content.
+**    schema=SCHEMA              Alternative VSV schema.
+**    header=YES|NO              First row of VSV defines the names of
+**                               columns if "yes".  Default "no".
+**    columns=N                  Assume the VSV file contains N columns.
+** fsep=FSET Field Seperator +** rsep=RSEP Record Seperator +** dsep=RSEP Decimal Seperator +** skip=N skip N records of file (default 0) +** affinity=AFF affinity to apply to ALL columns +** default: none +** none text integer real numeric +** +** If schema= is omitted, then the columns are named "c0", "c1", "c2", +** and so forth. If columns=N is omitted, then the file is opened and +** the number of columns in the first row is counted to determine the +** column count. If header=YES, then the first row is skipped. +*/ +static int vsvtabConnect(sqlite3* db, + void* pAux, + int argc, + const char* const* argv, + sqlite3_vtab** ppVtab, + char** pzErr) { + VsvTable* pNew = 0; /* The VsvTable object to construct */ + int affinity = -1; /* Affinity coercion */ + int bHeader = -1; /* header= flags. -1 means not seen yet */ + int validateUTF8 = -1; /* validateUTF8 flag */ + int rc = SQLITE_OK; /* Result code from this routine */ + size_t i, j; /* Loop counters */ + int b; /* Value of a boolean parameter */ + int nCol = -99; /* Value of the columns= parameter */ + int nSkip = -1; /* Value of the skip= parameter */ + int bNulls = -1; /* Process Nulls flag */ + VsvReader sRdr; /* A VSV file reader used to store an error + ** message and/or to count the number of columns */ + static const char* azParam[] = {"filename", "data", "schema", "fsep", "rsep", "dsep"}; + char* azPValue[6]; /* Parameter values */ +#define VSV_FILENAME (azPValue[0]) +#define VSV_DATA (azPValue[1]) +#define VSV_SCHEMA (azPValue[2]) +#define VSV_FSEP (azPValue[3]) +#define VSV_RSEP (azPValue[4]) +#define VSV_DSEP (azPValue[5]) + + assert(sizeof(azPValue) == sizeof(azParam)); + memset(&sRdr, 0, sizeof(sRdr)); + memset(azPValue, 0, sizeof(azPValue)); + for (i = 3; i < (size_t)argc; i++) { + const char* z = argv[i]; + const char* zValue; + for (j = 0; j < sizeof(azParam) / sizeof(azParam[0]); j++) { + if (vsv_string_parameter(&sRdr, azParam[j], z, &azPValue[j])) { + break; + } + } + if (j < 
sizeof(azParam) / sizeof(azParam[0])) { + if (sRdr.zErr[0]) { + goto vsvtab_connect_error; + } + } else if (vsv_boolean_parameter("header", 6, z, &b)) { + if (bHeader >= 0) { + vsv_errmsg(&sRdr, "more than one 'header' parameter"); + goto vsvtab_connect_error; + } + bHeader = b; + } else if (vsv_boolean_parameter("validatetext", 12, z, &b)) { + if (validateUTF8 >= 0) { + vsv_errmsg(&sRdr, "more than one 'validatetext' parameter"); + goto vsvtab_connect_error; + } + validateUTF8 = b; + } else if (vsv_boolean_parameter("nulls", 5, z, &b)) { + if (bNulls >= 0) { + vsv_errmsg(&sRdr, "more than one 'nulls' parameter"); + goto vsvtab_connect_error; + } + bNulls = b; + } else if ((zValue = vsv_parameter("columns", 7, z)) != 0) { + if (nCol > 0) { + vsv_errmsg(&sRdr, "more than one 'columns' parameter"); + goto vsvtab_connect_error; + } + nCol = atoi(zValue); + if (nCol <= 0) { + vsv_errmsg(&sRdr, "column= value must be positive"); + goto vsvtab_connect_error; + } + } else if ((zValue = vsv_parameter("skip", 4, z)) != 0) { + if (nSkip > 0) { + vsv_errmsg(&sRdr, "more than one 'skip' parameter"); + goto vsvtab_connect_error; + } + nSkip = atoi(zValue); + if (nSkip <= 0) { + vsv_errmsg(&sRdr, "skip= value must be positive"); + goto vsvtab_connect_error; + } + } else if ((zValue = vsv_parameter("affinity", 8, z)) != 0) { + if (affinity > -1) { + vsv_errmsg(&sRdr, "more than one 'affinity' parameter"); + goto vsvtab_connect_error; + } + if (sqlite3_strnicmp(zValue, "none", 4) == 0) + affinity = 0; + else if (sqlite3_strnicmp(zValue, "blob", 4) == 0) + affinity = 1; + else if (sqlite3_strnicmp(zValue, "text", 4) == 0) + affinity = 2; + else if (sqlite3_strnicmp(zValue, "integer", 7) == 0) + affinity = 3; + else if (sqlite3_strnicmp(zValue, "real", 4) == 0) + affinity = 4; + else if (sqlite3_strnicmp(zValue, "numeric", 7) == 0) + affinity = 5; + else { + vsv_errmsg(&sRdr, "unknown affinity: '%s'", zValue); + goto vsvtab_connect_error; + } + } else { + vsv_errmsg(&sRdr, "bad 
parameter: '%s'", z); + goto vsvtab_connect_error; + } + } + if (affinity == -1) { + affinity = 0; + } + if (bNulls == -1) { + bNulls = 0; + } + if (validateUTF8 == -1) { + validateUTF8 = 0; + } + if ((VSV_FILENAME == 0) == (VSV_DATA == 0)) { + vsv_errmsg(&sRdr, "must specify either filename= or data= but not both"); + goto vsvtab_connect_error; + } + if (vsv_parse_sep_char(VSV_FSEP, ',', &(sRdr.fsep))) { + vsv_errmsg(&sRdr, "cannot parse fsep: '%s'", VSV_FSEP); + goto vsvtab_connect_error; + } + if (vsv_parse_sep_char(VSV_RSEP, '\n', &(sRdr.rsep))) { + vsv_errmsg(&sRdr, "cannot parse rsep: '%s'", VSV_RSEP); + goto vsvtab_connect_error; + } + if (vsv_parse_sep_char(VSV_DSEP, '.', &(sRdr.dsep))) { + vsv_errmsg(&sRdr, "cannot parse dsep: '%s'", VSV_DSEP); + goto vsvtab_connect_error; + } + if ((nCol <= 0 || bHeader == 1) && vsv_reader_open(&sRdr, VSV_FILENAME, VSV_DATA)) { + goto vsvtab_connect_error; + } + pNew = sqlite3_malloc(sizeof(*pNew)); + *ppVtab = (sqlite3_vtab*)pNew; + if (pNew == 0) { + goto vsvtab_connect_oom; + } + memset(pNew, 0, sizeof(*pNew)); + pNew->fsep = sRdr.fsep; + pNew->rsep = sRdr.rsep; + pNew->dsep = sRdr.dsep; + pNew->affinity = affinity; + pNew->validateUTF8 = validateUTF8; + pNew->nulls = bNulls; + if (VSV_SCHEMA == 0) { + sqlite3_str* pStr = sqlite3_str_new(0); + char* zSep = ""; + int iCol = 0; + sqlite3_str_appendf(pStr, "CREATE TABLE x("); + if (nCol < 0 && bHeader < 1) { + nCol = 0; + do { + vsv_read_one_field(&sRdr); + nCol++; + } while (sRdr.cTerm == sRdr.fsep); + } + if (nCol > 0 && bHeader < 1) { + for (iCol = 0; iCol < nCol; iCol++) { + sqlite3_str_appendf(pStr, "%sc%d", zSep, iCol); + zSep = ","; + } + } else { + do { + char* z = vsv_read_one_field(&sRdr); + if ((nCol > 0 && iCol < nCol) || (nCol < 0 && bHeader)) { + sqlite3_str_appendf(pStr, "%s\"%w\"", zSep, z); + zSep = ","; + iCol++; + } + } while (sRdr.cTerm == sRdr.fsep); + if (nCol < 0) { + nCol = iCol; + } else { + while (iCol < nCol) { + sqlite3_str_appendf(pStr, 
"%sc%d", zSep, ++iCol); + zSep = ","; + } + } + } + sqlite3_str_appendf(pStr, ")"); + VSV_SCHEMA = sqlite3_str_finish(pStr); + if (VSV_SCHEMA == 0) { + goto vsvtab_connect_oom; + } + } else if (nCol < 0) { + do { + vsv_read_one_field(&sRdr); + nCol++; + } while (sRdr.cTerm == sRdr.fsep); + } else if (nSkip < 1 && bHeader == 1) { + do { + vsv_read_one_field(&sRdr); + } while (sRdr.cTerm == sRdr.fsep); + } + pNew->nCol = nCol; + if (nSkip > 0) { + int tskip = nSkip + (bHeader == 1); + vsv_reader_reset(&sRdr); + if (vsv_reader_open(&sRdr, VSV_FILENAME, VSV_DATA)) { + goto vsvtab_connect_error; + } + do { + do { + if (!vsv_read_one_field(&sRdr)) + goto vsvtab_connect_error; + } while (sRdr.cTerm == sRdr.fsep); + tskip--; + } while (tskip > 0 && sRdr.cTerm == sRdr.rsep); + if (tskip > 0) { + vsv_errmsg(&sRdr, "premature end of file during skip"); + goto vsvtab_connect_error; + } + } + pNew->zFilename = VSV_FILENAME; + VSV_FILENAME = 0; + pNew->zData = VSV_DATA; + VSV_DATA = 0; + if (bHeader != 1 && nSkip < 1) { + pNew->iStart = 0; + } else if (pNew->zData) { + pNew->iStart = (int)sRdr.iIn; + } else { + pNew->iStart = (int)(ftell(sRdr.in) - sRdr.nIn + sRdr.iIn); + } + vsv_reader_reset(&sRdr); + rc = sqlite3_declare_vtab(db, VSV_SCHEMA); + if (rc) { + vsv_errmsg(&sRdr, "bad schema: '%s' - %s", VSV_SCHEMA, sqlite3_errmsg(db)); + goto vsvtab_connect_error; + } + for (i = 0; i < sizeof(azPValue) / sizeof(azPValue[0]); i++) { + sqlite3_free(azPValue[i]); + } + /* + ** Rationale for DIRECTONLY: + ** An attacker who controls a database schema could use this vtab + ** to exfiltrate sensitive data from other files in the filesystem. + ** And, recommended practice is to put all VSV virtual tables in the + ** TEMP namespace, so they should still be usable from within TEMP + ** views, so there shouldn't be a serious loss of functionality by + ** prohibiting the use of this vtab from persistent triggers and views. 
+ */ + sqlite3_vtab_config(db, SQLITE_VTAB_DIRECTONLY); + return SQLITE_OK; + +vsvtab_connect_oom: + rc = SQLITE_NOMEM; + vsv_errmsg(&sRdr, "out of memory"); + +vsvtab_connect_error: + if (pNew) { + vsvtabDisconnect(&pNew->base); + } + for (i = 0; i < sizeof(azPValue) / sizeof(azPValue[0]); i++) { + sqlite3_free(azPValue[i]); + } + if (sRdr.zErr[0]) { + sqlite3_free(*pzErr); + *pzErr = sqlite3_mprintf("%s", sRdr.zErr); + } + vsv_reader_reset(&sRdr); + if (rc == SQLITE_OK) { + rc = SQLITE_ERROR; + } + return rc; +} + +/* +** Reset the current row content held by a VsvCursor. +*/ +static void vsvtabCursorRowReset(VsvCursor* pCur) { + VsvTable* pTab = (VsvTable*)pCur->base.pVtab; + int i; + for (i = 0; i < pTab->nCol; i++) { + sqlite3_free(pCur->azVal[i]); + pCur->azVal[i] = 0; + pCur->aLen[i] = 0; + pCur->dLen[i] = -1; + } +} + +/* +** The xConnect and xCreate methods do the same thing, but they must be +** different so that the virtual table is not an eponymous virtual table. +*/ +static int vsvtabCreate(sqlite3* db, + void* pAux, + int argc, + const char* const* argv, + sqlite3_vtab** ppVtab, + char** pzErr) { + return vsvtabConnect(db, pAux, argc, argv, ppVtab, pzErr); +} + +/* +** Destructor for a VsvCursor. +*/ +static int vsvtabClose(sqlite3_vtab_cursor* cur) { + VsvCursor* pCur = (VsvCursor*)cur; + vsvtabCursorRowReset(pCur); + vsv_reader_reset(&pCur->rdr); + sqlite3_free(cur); + return SQLITE_OK; +} + +/* +** Constructor for a new VsvTable cursor object. 
+*/ +static int vsvtabOpen(sqlite3_vtab* p, sqlite3_vtab_cursor** ppCursor) { + VsvTable* pTab = (VsvTable*)p; + VsvCursor* pCur; + size_t nByte; + nByte = sizeof(*pCur) + (sizeof(char*) + (2 * sizeof(int))) * pTab->nCol; + pCur = sqlite3_malloc64(nByte); + if (pCur == 0) + return SQLITE_NOMEM; + memset(pCur, 0, nByte); + pCur->azVal = (char**)&pCur[1]; + pCur->aLen = (int*)&pCur->azVal[pTab->nCol]; + pCur->dLen = (int*)&pCur->aLen[pTab->nCol]; + pCur->rdr.fsep = pTab->fsep; + pCur->rdr.rsep = pTab->rsep; + pCur->rdr.dsep = pTab->dsep; + pCur->rdr.affinity = pTab->affinity; + *ppCursor = &pCur->base; + if (vsv_reader_open(&pCur->rdr, pTab->zFilename, pTab->zData)) { + vsv_xfer_error(pTab, &pCur->rdr); + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +/* +** Advance a VsvCursor to its next row of input. +** Set the EOF marker if we reach the end of input. +*/ +static int vsvtabNext(sqlite3_vtab_cursor* cur) { + VsvCursor* pCur = (VsvCursor*)cur; + VsvTable* pTab = (VsvTable*)cur->pVtab; + int i = 0; + char* z; + do { + z = vsv_read_one_field(&pCur->rdr); + if (z == 0) { + if (i < pTab->nCol) + pCur->dLen[i] = -1; + } else if (i < pTab->nCol) { + if (pCur->aLen[i] < pCur->rdr.n + 1) { + char* zNew = sqlite3_realloc64(pCur->azVal[i], pCur->rdr.n + 1); + if (zNew == 0) { + z = 0; + vsv_errmsg(&pCur->rdr, "out of memory"); + vsv_xfer_error(pTab, &pCur->rdr); + break; + } + pCur->azVal[i] = zNew; + pCur->aLen[i] = pCur->rdr.n + 1; + } + if (!pCur->rdr.notNull && pTab->nulls) { + pCur->dLen[i] = -1; + } else { + pCur->dLen[i] = pCur->rdr.n; + memcpy(pCur->azVal[i], z, pCur->rdr.n + 1); + } + i++; + } + } while (pCur->rdr.cTerm == pCur->rdr.fsep); + if ((pCur->rdr.cTerm == EOF && i == 0)) { + pCur->iRowid = -1; + } else { + pCur->iRowid++; + while (i < pTab->nCol) { + pCur->dLen[i] = -1; + i++; + } + } + return SQLITE_OK; +} + +/* +** +** Determine affinity of field +** +** ignore leading space +** then may have + or - +** then may have digits or . (if . 
found then type=real) +** then may have digits (if another . then not number) +** then may have e (if found then type=real) +** then may have + or - +** then may have digits +** then may have trailing space +*/ +static int vsv_isValidNumber(int dsep, char* arg) { + char* start; + char* stop; + int isValid = 0; + int hasDigit = 0; + + start = arg; + stop = arg + strlen(arg) - 1; + while (start <= stop && *start == ' ') // strip spaces from begining + { + start++; + } + while (start <= stop && *stop == ' ') // strip spaces from end + { + stop--; + } + if (start > stop) { + goto vsv_end_isValidNumber; + } + if (start <= stop && (*start == '+' || *start == '-')) // may have + or - + { + start++; + } + if (start <= stop && isdigit(*start)) // must have a digit to be valid + { + hasDigit = 1; + isValid = 1; + } + while (start <= stop && isdigit(*start)) // bunch of digits + { + start++; + } + if (start <= stop && *start == dsep) // may have decimal separator + { + isValid = 2; + if (*start != '.') { + *start = '.'; + } + start++; + } + if (start <= stop && isdigit(*start)) { + hasDigit = 1; + } + while (start <= stop && isdigit(*start)) // bunch of digits + { + start++; + } + if (!hasDigit) // no digits then invalid + { + isValid = 0; + goto vsv_end_isValidNumber; + } + if (start <= stop && (*start == 'e' || *start == 'E')) // may have 'e' or 'E' + { + isValid = 3; + start++; + } + if (start <= stop && isValid == 3 && (*start == '+' || *start == '-')) { + start++; + } + if (start <= stop && isValid == 3 && isdigit(*start)) { + isValid = 2; + } + while (start <= stop && isdigit(*start)) // bunch of digits + { + start++; + } + if (isValid == 3) { + isValid = 0; + } +vsv_end_isValidNumber: + if (start <= stop) { + isValid = 0; + } + return isValid; +} + +/* +** Validate UTF-8 +** Return -1 if invalid else length +*/ +static long long vsv_utf8IsValid(char* string) { + long long length = 0; + unsigned char* start; + int trailing = 0; + unsigned char c; + + start = (unsigned 
char*)string; + while ((c = *start)) { + if (trailing) { + if ((c & 0xC0) == 0x80) { + trailing--; + start++; + length++; + continue; + } else { + length = -1; + break; + } + } + if ((c & 0x80) == 0) { + start++; + length++; + continue; + } + if ((c & 0xE0) == 0xC0) { + trailing = 1; + start++; + length++; + continue; + } + if ((c & 0xF0) == 0xE0) { + trailing = 2; + start++; + length++; + continue; + } + if ((c & 0xF8) == 0xF0) { + trailing = 3; + start++; + length++; + continue; + } + length = -1; + break; + } + return trailing ? -1 : length; /* continuation bytes still pending at the terminator: sequence is cut short, so invalid */ +} + +/* +** Return values of columns for the row at which the VsvCursor +** is currently pointing. +*/ +static int vsvtabColumn(sqlite3_vtab_cursor* cur, /* The cursor */ + sqlite3_context* ctx, /* First argument to sqlite3_result_...() */ + int i /* Which column to return */ +) { + VsvCursor* pCur = (VsvCursor*)cur; + VsvTable* pTab = (VsvTable*)cur->pVtab; + long long dLen = pCur->dLen[i]; + long long length = 0; + + if (i >= 0 && i < pTab->nCol && pCur->azVal[i] != 0 && dLen > -1) { + switch (pTab->affinity) { + case 0: { + if (pTab->validateUTF8) { + length = vsv_utf8IsValid(pCur->azVal[i]); + if (length == dLen) { + sqlite3_result_text(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + } else { + sqlite3_result_error(ctx, "Invalid UTF8 Data", -1); + } + } else { + sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT); + } + break; + } + case 1: { + sqlite3_result_blob(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + break; + } + case 2: { + if (pTab->validateUTF8) { + length = vsv_utf8IsValid(pCur->azVal[i]); + if (length < dLen) { + sqlite3_result_blob(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + } else { + sqlite3_result_text(ctx, pCur->azVal[i], length, SQLITE_TRANSIENT); + } + } else { + sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT); + } + break; + } + case 3: { + switch (vsv_isValidNumber(pCur->rdr.dsep, pCur->azVal[i])) { + case 1: { + sqlite3_result_int64(ctx, strtoll(pCur->azVal[i], 0, 10)); + break; +
} + default: { + if (pTab->validateUTF8) { + length = vsv_utf8IsValid(pCur->azVal[i]); + if (length < dLen) { + sqlite3_result_blob(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + } else { + sqlite3_result_text(ctx, pCur->azVal[i], length, SQLITE_TRANSIENT); + } + } else { + sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT); + } + break; + } + } + break; + } + case 4: { + switch (vsv_isValidNumber(pCur->rdr.dsep, pCur->azVal[i])) { + case 1: + case 2: { + sqlite3_result_double(ctx, strtod(pCur->azVal[i], 0)); + break; + } + default: { + if (pTab->validateUTF8) { + length = vsv_utf8IsValid(pCur->azVal[i]); + if (length < dLen) { + sqlite3_result_blob(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + } else { + sqlite3_result_text(ctx, pCur->azVal[i], length, SQLITE_TRANSIENT); + } + } else { + sqlite3_result_text(ctx, pCur->azVal[i], -1, SQLITE_TRANSIENT); + } + break; + } + } + break; + } + case 5: { + switch (vsv_isValidNumber(pCur->rdr.dsep, pCur->azVal[i])) { + case 1: { + sqlite3_result_int64(ctx, strtoll(pCur->azVal[i], 0, 10)); + break; + } + case 2: { + long double dv, fp, ip; + + dv = strtold(pCur->azVal[i], 0); + fp = modfl(dv, &ip); + if (sizeof(long double) > sizeof(double)) { + if (fp == 0.0L && dv >= -9223372036854775808.0L && + dv <= 9223372036854775807.0L) { + sqlite3_result_int64(ctx, (long long)dv); + } else { + sqlite3_result_double(ctx, (double)dv); + } + } else { + // Only convert if it will fit in a 6-byte varint + if (fp == 0.0L && dv >= -140737488355328.0L && + dv <= 140737488355328.0L) { + sqlite3_result_int64(ctx, (long long)dv); + } else { + sqlite3_result_double(ctx, (double)dv); + } + } + break; + } + default: { + if (pTab->validateUTF8) { + length = vsv_utf8IsValid(pCur->azVal[i]); + if (length < dLen) { + sqlite3_result_blob(ctx, pCur->azVal[i], dLen, SQLITE_TRANSIENT); + } else { + sqlite3_result_text(ctx, pCur->azVal[i], length, SQLITE_TRANSIENT); + } + } else { + sqlite3_result_text(ctx, pCur->azVal[i], -1, 
SQLITE_TRANSIENT); + } + break; + } + } + } + } + } + return SQLITE_OK; +} + +/* +** Return the rowid for the current row. +*/ +static int vsvtabRowid(sqlite3_vtab_cursor* cur, sqlite_int64* pRowid) { + VsvCursor* pCur = (VsvCursor*)cur; + *pRowid = pCur->iRowid; + return SQLITE_OK; +} + +/* +** Return TRUE if the cursor has been moved off of the last +** row of output. +*/ +static int vsvtabEof(sqlite3_vtab_cursor* cur) { + VsvCursor* pCur = (VsvCursor*)cur; + return pCur->iRowid < 0; +} + +/* +** Only a full table scan is supported. So xFilter simply rewinds to +** the beginning. +*/ +static int vsvtabFilter(sqlite3_vtab_cursor* pVtabCursor, + int idxNum, + const char* idxStr, + int argc, + sqlite3_value** argv) { + VsvCursor* pCur = (VsvCursor*)pVtabCursor; + VsvTable* pTab = (VsvTable*)pVtabCursor->pVtab; + pCur->iRowid = 0; + if (pCur->rdr.in == 0) { + assert(pCur->rdr.zIn == pTab->zData); + assert(pTab->iStart >= 0); + assert((size_t)pTab->iStart <= pCur->rdr.nIn); + pCur->rdr.iIn = pTab->iStart; + } else { + fseek(pCur->rdr.in, pTab->iStart, SEEK_SET); + pCur->rdr.iIn = 0; + pCur->rdr.nIn = 0; + } + return vsvtabNext(pVtabCursor); +} + +/* +** Only a forward full table scan is supported. xBestIndex is mostly +** a no-op. 
+*/ +static int vsvtabBestIndex(sqlite3_vtab* tab, sqlite3_index_info* pIdxInfo) { + pIdxInfo->estimatedCost = 1000000; + return SQLITE_OK; +} + +static sqlite3_module vsv_module = { + .xCreate = vsvtabCreate, + .xConnect = vsvtabConnect, + .xBestIndex = vsvtabBestIndex, + .xDisconnect = vsvtabDisconnect, + .xDestroy = vsvtabDisconnect, + .xOpen = vsvtabOpen, + .xClose = vsvtabClose, + .xFilter = vsvtabFilter, + .xNext = vsvtabNext, + .xEof = vsvtabEof, + .xColumn = vsvtabColumn, + .xRowid = vsvtabRowid, +}; + +int vsv_init(sqlite3* db) { + sqlite3_create_module(db, "vsv", &vsv_module, 0); + return SQLITE_OK; +} diff --git a/libsql-ffi/bundled/sqlean/vsv/extension.h b/libsql-ffi/bundled/sqlean/vsv/extension.h new file mode 100644 index 0000000000..b73acb679c --- /dev/null +++ b/libsql-ffi/bundled/sqlean/vsv/extension.h @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// CSV files as virtual tables in SQLite + +#ifndef VSV_EXTENSION_H +#define VSV_EXTENSION_H + +#include "sqlite3ext.h" + +int vsv_init(sqlite3* db); + +#endif /* VSV_EXTENSION_H */ diff --git a/libsql-ffi/bundled/src/sqlean.c b/libsql-ffi/bundled/src/sqlean.c new file mode 100644 index 0000000000..603e852413 --- /dev/null +++ b/libsql-ffi/bundled/src/sqlean.c @@ -0,0 +1,7 @@ +#include "sqlite3.c" + +extern int sqlite3_sqlean_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api); + +int core_init(const char* dummy) { + return sqlite3_auto_extension((void*)sqlite3_sqlean_init); +} diff --git a/libsql-ffi/bundled/src/sqlite3-sqlean-stripped.c b/libsql-ffi/bundled/src/sqlite3-sqlean-stripped.c new file mode 100755 index 0000000000..ae01ad8763 --- /dev/null +++ b/libsql-ffi/bundled/src/sqlite3-sqlean-stripped.c @@ -0,0 +1,48 @@ +// Copyright (c) 2023 Anton Zhiyanov, MIT License +// https://github.com/nalgeon/sqlean + +// Sqlean extensions bundle. 
+ +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +// include most of the extensions, +#include "crypto/extension.h" +#include "define/extension.h" +#include "fileio/extension.h" +#include "fuzzy/extension.h" +#if !defined(_WIN32) +#include "ipaddr/extension.h" +#endif +#include "math/extension.h" +#include "regexp/extension.h" +#include "stats/extension.h" +#include "text/extension.h" +#include "time/extension.h" +#include "unicode/extension.h" +#include "uuid/extension.h" +#include "vsv/extension.h" + +#include "sqlean.h" + +// Returns the current Sqlean version. +static void sqlean_version(sqlite3_context* context, int argc, sqlite3_value** argv) { + sqlite3_result_text(context, SQLEAN_VERSION, -1, SQLITE_STATIC); +} + +#ifdef _WIN32 +__declspec(dllexport) +#endif + int sqlite3_sqlean_init(sqlite3* db, char** errmsg_ptr, const sqlite3_api_routines* api) { + (void)errmsg_ptr; + SQLITE_EXTENSION_INIT2(api); + static const int flags = SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC; + sqlite3_create_function(db, "sqlean_version", 0, flags, 0, sqlean_version, 0, 0); + crypto_init(db); + fuzzy_init(db); + math_init(db); + stats_init(db); + text_init(db); + uuid_init(db); + return SQLITE_OK; +} diff --git a/libsql-server/Cargo.toml b/libsql-server/Cargo.toml index 6a22910548..f5484f2c95 100644 --- a/libsql-server/Cargo.toml +++ b/libsql-server/Cargo.toml @@ -64,7 +64,7 @@ serde_json = { version = "1.0.91", features = ["preserve_order"] } md-5 = "0.10" sha2 = "0.10" sha256 = "1.1.3" -libsql-sys = { path = "../libsql-sys", features = ["wal"], default-features = false } +libsql-sys = { path = "../libsql-sys", features = ["wal", "sqlean-extensions" ], default-features = false } libsql-hrana = { path = "../libsql-hrana" } sqlite3-parser = { package = "libsql-sqlite3-parser", path = "../vendored/sqlite3-parser", default-features = false, features = [ "YYNOERRORRECOVERY" ] } tempfile = "3.7.0" diff --git a/libsql-sys/Cargo.toml b/libsql-sys/Cargo.toml index 
10b7b54075..d2ab9554b0 100644 --- a/libsql-sys/Cargo.toml +++ b/libsql-sys/Cargo.toml @@ -26,4 +26,5 @@ rusqlite = ["dep:rusqlite"] wasmtime-bindings = ["libsql-ffi/wasmtime-bindings"] unix-excl-vfs = [] encryption = ["libsql-ffi/multiple-ciphers"] +sqlean-extensions = ["libsql-ffi/sqlean-extensions"]