From c3c71b792a76782ec6e98ab136c7ccd52028f1e6 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Fri, 20 Dec 2024 14:20:27 -0800 Subject: [PATCH 1/5] Implement Variant encoding --- parquet-variant/pom.xml | 88 +++ .../variant/MalformedVariantException.java | 23 + .../variant/UnknownVariantTypeException.java | 39 ++ .../org/apache/parquet/variant/Variant.java | 446 ++++++++++++ .../parquet/variant/VariantBuilder.java | 631 +++++++++++++++++ .../variant/VariantDuplicateKeyException.java | 39 ++ .../variant/VariantSizeLimitException.java | 24 + .../apache/parquet/variant/VariantUtil.java | 646 ++++++++++++++++++ .../parquet/variant/TestVariantEncoding.java | 490 +++++++++++++ pom.xml | 1 + 10 files changed, 2427 insertions(+) create mode 100644 parquet-variant/pom.xml create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java create mode 100644 parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java create mode 100644 parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java diff --git a/parquet-variant/pom.xml b/parquet-variant/pom.xml new file mode 100644 index 0000000000..6bfc2ff525 --- /dev/null +++ b/parquet-variant/pom.xml @@ -0,0 +1,88 @@ + + + + org.apache.parquet + parquet + ../pom.xml + 1.16.0-SNAPSHOT + + + 4.0.0 + + parquet-variant + jar + + Apache Parquet Variant + https://parquet.apache.org + + + + + + + org.apache.parquet + parquet-jackson + ${project.version} + 
runtime + + + ${jackson.groupId} + jackson-core + ${jackson.version} + + + ${jackson.groupId} + jackson-databind + ${jackson-databind.version} + test + + + com.google.guava + guava + ${guava.version} + test + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + test + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + org.apache.maven.plugins + maven-shade-plugin + + + + + diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java new file mode 100644 index 0000000000..e9bff469d2 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the Variant is malformed. 
+ */ +public class MalformedVariantException extends RuntimeException { +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java new file mode 100644 index 0000000000..2f0bd5dce6 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +/** + * An exception indicating that the Variant contains an unknown type. + */ +public class UnknownVariantTypeException extends RuntimeException { + public final int typeId; + + /** + * @param typeId the type id that was unknown + */ + public UnknownVariantTypeException(int typeId) { + super("Unknown type in Variant. 
id: " + typeId); + this.typeId = typeId; + } + + /** + * @return the type id that was unknown + */ + public int getTypeId() { + return typeId; + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java new file mode 100644 index 0000000000..4fcdb6b0e5 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -0,0 +1,446 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.variant; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.Base64; +import java.util.Locale; + +import static java.time.temporal.ChronoField.*; +import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; +import static org.apache.parquet.variant.VariantUtil.*; + +/** + * This Variant class holds the Variant-encoded value and metadata binary values. + */ +public final class Variant { + final byte[] value; + final byte[] metadata; + /** + * The starting index into `value` where the variant value starts. This is used to avoid copying + * the value binary when reading a sub-variant in the array/object element. + */ + final int pos; + + public Variant(byte[] value, byte[] metadata) { + this(value, metadata, 0); + } + + Variant(byte[] value, byte[] metadata, int pos) { + this.value = value; + this.metadata = metadata; + this.pos = pos; + // There is currently only one allowed version. 
+ if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) { + throw malformedVariant(); + } + } + + public byte[] getValue() { + if (pos == 0) return value; + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + return Arrays.copyOfRange(value, pos, pos + size); + } + + public byte[] getMetadata() { + return metadata; + } + + /** + * @return the boolean value + */ + public boolean getBoolean() { + return VariantUtil.getBoolean(value, pos); + } + + /** + * @return the long value + */ + public long getLong() { + return VariantUtil.getLong(value, pos); + } + + /** + * @return the double value + */ + public double getDouble() { + return VariantUtil.getDouble(value, pos); + } + + /** + * @return the decimal value + */ + public BigDecimal getDecimal() { + return VariantUtil.getDecimal(value, pos); + } + + /** + * @return the float value + */ + public float getFloat() { + return VariantUtil.getFloat(value, pos); + } + + /** + * @return the binary value + */ + public byte[] getBinary() { + return VariantUtil.getBinary(value, pos); + } + + /** + * @return the string value + */ + public String getString() { + return VariantUtil.getString(value, pos); + } + + /** + * @return the type info bits from a variant value + */ + public int getTypeInfo() { + return VariantUtil.getTypeInfo(value, pos); + } + + /** + * @return the type of the variant value + */ + public Type getType() { + return VariantUtil.getType(value, pos); + } + + /** + * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`. + */ + public int objectSize() { + return handleObject(value, pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + } + + // Find the field value whose key is equal to `key`. Return null if the key is not found. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + + /** + * Returns the object field Variant value whose key is equal to `key`. + * Return null if the key is not found. 
`getType()` must be `Type.OBJECT`. + * @param key the key to look up + * @return the field value whose key is equal to `key`, or null if key is not found + */ + public Variant getFieldByKey(String key) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + // Use linear search for a short list. Switch to binary search (which relies on the fields + // being stored sorted by key) when the length reaches `BINARY_SEARCH_THRESHOLD`. + final int BINARY_SEARCH_THRESHOLD = 32; + if (size < BINARY_SEARCH_THRESHOLD) { + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + if (key.equals(getMetadataKey(metadata, id))) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + return new Variant(value, metadata, dataStart + offset); + } + } + } else { + int low = 0; + int high = size - 1; + while (low <= high) { + // Use unsigned right shift to compute the middle of `low` and `high`. This is not only a + // performance optimization: it also properly handles the case where `low + high` + // overflows int. + int mid = (low + high) >>> 1; + int id = readUnsigned(value, idStart + idSize * mid, idSize); + int cmp = getMetadataKey(metadata, id).compareTo(key); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + int offset = readUnsigned(value, offsetStart + offsetSize * mid, offsetSize); + return new Variant(value, metadata, dataStart + offset); + } + } + } + return null; + }); + } + + /** + * A field in a Variant object: the key string paired with the field's Variant value. + */ + public static final class ObjectField { + public final String key; + public final Variant value; + + public ObjectField(String key, Variant value) { + this.key = key; + this.value = value; + } + } + + // Get the object field at the `index` slot. Return null if `index` is out of the bound of + // `[0, objectSize())`. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + /** + * Returns the object field at the `index` slot.
Returns null if `index` is out of the bound of + * `[0, objectSize())`. `getType()` must be `Type.OBJECT`. + * @param index the index of the object field to get + * @return the ObjectField at the `index` slot, or null if `index` is out of bounds + */ + public ObjectField getFieldAtIndex(int index) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + if (index < 0 || index >= size) return null; + int id = readUnsigned(value, idStart + idSize * index, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + String key = getMetadataKey(metadata, id); + Variant v = new Variant(value, metadata, dataStart + offset); + return new ObjectField(key, v); + }); + } + + /** + * Returns the dictionary ID for the object field at the `index` slot. + * `getType()` must be `Type.OBJECT`. Unlike `getFieldAtIndex`, an out-of-bounds `index` throws. + * @param index the index of the object field to get the dictionary ID for + * @return the dictionary ID for the object field at the `index` slot + * @throws MalformedVariantException if `index` is out of bounds + */ + public int getDictionaryIdAtIndex(int index) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + if (index < 0 || index >= size) { + throw malformedVariant(); + } + return readUnsigned(value, idStart + idSize * index, idSize); + }); + } + + /** + * @return the number of array elements. `getType()` must be `Type.ARRAY`. + */ + public int arraySize() { + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size); + } + + /** + * Returns the array element Variant value at the `index` slot. Returns null if `index` is + * out of the bound of `[0, arraySize())`. `getType()` must be `Type.ARRAY`.
+ * @param index the index of the array element to get + * @return the array element Variant at the `index` slot, or null if `index` is out of bounds + */ + public Variant getElementAtIndex(int index) { + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + if (index < 0 || index >= size) return null; + int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + return new Variant(value, metadata, dataStart + offset); + }); + } + + /** + * @param zoneId The ZoneId to use for formatting timestamps + * @return the JSON representation of the variant + * @throws MalformedVariantException if the variant is malformed + */ + public String toJson(ZoneId zoneId) { + return toJson(zoneId, false); + } + + /** + * @param zoneId The ZoneId to use for formatting timestamps + * @param truncateTrailingZeros Whether to truncate trailing zeros in decimal values or timestamps + * @return the JSON representation of the variant + * @throws MalformedVariantException if the variant is malformed + */ + public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) { + StringBuilder sb = new StringBuilder(); + toJsonImpl(value, metadata, pos, sb, zoneId, truncateTrailingZeros); + return sb.toString(); + } + + /** + * Escapes a string so that it can be pasted into a JSON structure. For example, if `str` + * only contains a new-line character, then the result is "\n" (4 characters) + * @param str the string to escape + * @return the escaped string + */ + private static String escapeJson(String str) { + try (CharArrayWriter writer = new CharArrayWriter(); + JsonGenerator gen = new JsonFactory().createGenerator(writer)) { + gen.writeString(str); + gen.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + // A simplified and more performant version of `sb.append(escapeJson(str))`. It is used when we + // know `str` doesn't contain any special character that needs escaping. 
+ /** + * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any + * special characters that need escaping. This is more performant than + * `sb.append(escapeJson(str))`. + * @param sb the StringBuilder to append to + * @param str the string to append + */ + private static void appendQuoted(StringBuilder sb, String str) { + sb.append('"'); + sb.append(str); + sb.append('"'); + } + + /** The format for a timestamp without time zone; the fraction is always six digits. */ + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendValue(HOUR_OF_DAY, 2) + .appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 2) + .optionalStart() + .appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 2) + .appendFraction(MICRO_OF_SECOND, 6, 6, true) + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone: the no-zone format plus a `+HH:MM` offset. */ + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + /** The format for a timestamp without time zone, truncating trailing microsecond zeros. */ + private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER = + new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral('T') + .appendValue(HOUR_OF_DAY, 2) + .appendLiteral(':') + .appendValue(MINUTE_OF_HOUR, 2) + .optionalStart() + .appendLiteral(':') + .appendValue(SECOND_OF_MINUTE, 2) + .optionalStart() + .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .toFormatter(Locale.US); + + /** The format for a timestamp with time zone, truncating trailing microsecond zeros.
*/ + private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_TRUNC_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + private static Instant microsToInstant(long timestamp) { + return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS); + } + + private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, + ZoneId zoneId, boolean truncateTrailingZeros) { + switch (VariantUtil.getType(value, pos)) { + case OBJECT: + handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + sb.append('{'); + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + sb.append(escapeJson(getMetadataKey(metadata, id))); + sb.append(':'); + toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + } + sb.append('}'); + return null; + }); + break; + case ARRAY: + handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + sb.append('['); + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros); + } + sb.append(']'); + return null; + }); + break; + case NULL: + sb.append("null"); + break; + case BOOLEAN: + sb.append(VariantUtil.getBoolean(value, pos)); + break; + case LONG: + sb.append(VariantUtil.getLong(value, pos)); + break; + case STRING: + sb.append(escapeJson(VariantUtil.getString(value, pos))); + break; + case DOUBLE: + sb.append(VariantUtil.getDouble(value, pos)); + break; + case DECIMAL: + if (truncateTrailingZeros) { + sb.append(VariantUtil.getDecimal(value, pos).stripTrailingZeros().toPlainString()); + } else { + 
sb.append(VariantUtil.getDecimal(value, pos).toPlainString()); + } + break; + case DATE: + appendQuoted(sb, LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)).toString()); + break; + case TIMESTAMP: + if (truncateTrailingZeros) { + appendQuoted(sb, TIMESTAMP_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } else { + appendQuoted(sb, TIMESTAMP_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + } + break; + case TIMESTAMP_NTZ: + if (truncateTrailingZeros) { + appendQuoted(sb, TIMESTAMP_NTZ_TRUNC_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } else { + appendQuoted(sb, TIMESTAMP_NTZ_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + } + break; + case FLOAT: + sb.append(VariantUtil.getFloat(value, pos)); + break; + case BINARY: + appendQuoted(sb, Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); + break; + } + } +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java new file mode 100644 index 0000000000..574c8fdbde --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -0,0 +1,631 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.exc.InputCoercionException; + +import static org.apache.parquet.variant.VariantUtil.*; + +/** + * Builder for creating Variant value and metadata. + */ +public class VariantBuilder { + public VariantBuilder(boolean allowDuplicateKeys) { + this(allowDuplicateKeys, DEFAULT_SIZE_LIMIT); + } + + public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) { + this.allowDuplicateKeys = allowDuplicateKeys; + this.sizeLimitBytes = sizeLimitBytes; + } + + /** + * Parse a JSON string as a Variant value. + * @param json the JSON string to parse + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(String json) throws IOException { + return parseJson(json, new VariantBuilder(false)); + } + + /** + * Parse a JSON string as a Variant value. 
+ * @param json the JSON string to parse + * @param builder the VariantBuilder to use for building the Variant + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(String json, VariantBuilder builder) throws IOException { + try (JsonParser parser = new JsonFactory().createParser(json)) { + parser.nextToken(); + return parseJson(parser, builder); + } + } + + /** + * Parse a JSON parser as a Variant value. + * @param parser the JSON parser to use + * @param builder the VariantBuilder to use for building the Variant + * @return the Variant value + * @throws IOException if any JSON parsing error happens + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public static Variant parseJson(JsonParser parser, VariantBuilder builder) + throws IOException { + builder.buildFromJsonParser(parser); + return builder.result(); + } + + /** + * @return the Variant value + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the size limit + */ + public Variant result() { + int numKeys = dictionaryKeys.size(); + // Use long to avoid overflow in accumulating lengths. + long dictionaryStringSize = 0; + for (byte[] key : dictionaryKeys) { + dictionaryStringSize += key.length; + } + // Determine the number of bytes required per offset entry. + // The largest offset is the one-past-the-end value, which is total string size. It's very + // unlikely that the number of keys could be larger, but incorporate that into the calculation + // in case of pathological data. 
+ long maxSize = Math.max(dictionaryStringSize, numKeys); + if (maxSize > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + int offsetSize = getMinIntegerSize((int)maxSize); + + int offsetStart = 1 + offsetSize; + int stringStart = offsetStart + (numKeys + 1) * offsetSize; + long metadataSize = stringStart + dictionaryStringSize; + + if (metadataSize > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + byte[] metadata = new byte[(int) metadataSize]; + int headerByte = VERSION | ((offsetSize - 1) << 6); + writeLong(metadata, 0, headerByte, 1); + writeLong(metadata, 1, numKeys, offsetSize); + int currentOffset = 0; + for (int i = 0; i < numKeys; ++i) { + writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); + byte[] key = dictionaryKeys.get(i); + System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length); + currentOffset += key.length; + } + writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); + return new Variant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata); + } + + public void appendString(String str) { + byte[] text = str.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + checkCapacity((longStr ? 1 + U32_SIZE : 1) + text.length); + if (longStr) { + writeBuffer[writePos++] = primitiveHeader(LONG_STR); + writeLong(writeBuffer, writePos, text.length, U32_SIZE); + writePos += U32_SIZE; + } else { + writeBuffer[writePos++] = shortStrHeader(text.length); + } + System.arraycopy(text, 0, writeBuffer, writePos, text.length); + writePos += text.length; + } + + public void appendNull() { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(NULL); + } + + public void appendBoolean(boolean b) { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(b ? TRUE : FALSE); + } + + /** + * Appends a long value to the variant builder. The actual encoded integer type depends on the + * value range of the long value. 
+   * @param l the long value to append
+   */
+  public void appendLong(long l) {
+    // Reserve the worst case up front: one header byte plus an 8-byte payload.
+    checkCapacity(1 + 8);
+    if (l == (byte) l) {
+      writeBuffer[writePos++] = primitiveHeader(INT1);
+      writeLong(writeBuffer, writePos, l, 1);
+      writePos += 1;
+    } else if (l == (short) l) {
+      writeBuffer[writePos++] = primitiveHeader(INT2);
+      writeLong(writeBuffer, writePos, l, 2);
+      writePos += 2;
+    } else if (l == (int) l) {
+      writeBuffer[writePos++] = primitiveHeader(INT4);
+      writeLong(writeBuffer, writePos, l, 4);
+      writePos += 4;
+    } else {
+      writeBuffer[writePos++] = primitiveHeader(INT8);
+      writeLong(writeBuffer, writePos, l, 8);
+      writePos += 8;
+    }
+  }
+
+  /**
+   * Appends a double value to the variant builder as a header byte followed by the 8 bytes of
+   * `Double.doubleToLongBits(d)`.
+   * @param d the double value to append
+   */
+  public void appendDouble(double d) {
+    checkCapacity(1 + 8);
+    writeBuffer[writePos++] = primitiveHeader(DOUBLE);
+    writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8);
+    writePos += 8;
+  }
+
+  /**
+   * Appends a decimal value to the variant builder. The actual encoded decimal type depends on the
+   * precision and scale of the decimal value: the smallest of DECIMAL4, DECIMAL8 and DECIMAL16
+   * whose precision limit covers both is used. Each encoding is a header byte, a scale byte and
+   * the unscaled value.
+   * @param d the decimal value to append
+   */
+  public void appendDecimal(BigDecimal d) {
+    // Reserve the worst case: header byte + scale byte + 16-byte unscaled value.
+    checkCapacity(2 + 16);
+    BigInteger unscaled = d.unscaledValue();
+    if (d.scale() <= MAX_DECIMAL4_PRECISION && d.precision() <= MAX_DECIMAL4_PRECISION) {
+      writeBuffer[writePos++] = primitiveHeader(DECIMAL4);
+      writeBuffer[writePos++] = (byte) d.scale();
+      writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4);
+      writePos += 4;
+    } else if (d.scale() <= MAX_DECIMAL8_PRECISION && d.precision() <= MAX_DECIMAL8_PRECISION) {
+      writeBuffer[writePos++] = primitiveHeader(DECIMAL8);
+      writeBuffer[writePos++] = (byte) d.scale();
+      writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8);
+      writePos += 8;
+    } else {
+      // NOTE(review): this bound is only enforced when assertions are enabled; callers are
+      // presumably expected to pre-validate precision and scale - confirm.
+      assert d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION;
+      writeBuffer[writePos++] = primitiveHeader(DECIMAL16);
+      writeBuffer[writePos++] = (byte) d.scale();
+      // `toByteArray` returns a big-endian representation. We need to copy it in reverse order
+      // and sign-extend it to 16 bytes.
+      byte[] bytes = unscaled.toByteArray();
+      for (int i = 0; i < bytes.length; ++i) {
+        writeBuffer[writePos + i] = bytes[bytes.length - 1 - i];
+      }
+      byte sign = (byte) (bytes[0] < 0 ? -1 : 0);
+      for (int i = bytes.length; i < 16; ++i) {
+        writeBuffer[writePos + i] = sign;
+      }
+      writePos += 16;
+    }
+  }
+
+  /**
+   * Appends a date value to the variant builder as a header byte followed by a 4-byte payload.
+   * @param daysSinceEpoch the date to append, as a day count
+   */
+  public void appendDate(int daysSinceEpoch) {
+    checkCapacity(1 + 4);
+    writeBuffer[writePos++] = primitiveHeader(DATE);
+    writeLong(writeBuffer, writePos, daysSinceEpoch, 4);
+    writePos += 4;
+  }
+
+  /**
+   * Appends a timestamp value to the variant builder as a header byte followed by an 8-byte
+   * payload.
+   * @param microsSinceEpoch the timestamp to append, as a microsecond count
+   */
+  public void appendTimestamp(long microsSinceEpoch) {
+    checkCapacity(1 + 8);
+    writeBuffer[writePos++] = primitiveHeader(TIMESTAMP);
+    writeLong(writeBuffer, writePos, microsSinceEpoch, 8);
+    writePos += 8;
+  }
+
+  /**
+   * Appends a timestamp-without-time-zone value to the variant builder as a header byte followed
+   * by an 8-byte payload.
+   * @param microsSinceEpoch the timestamp to append, as a microsecond count
+   */
+  public void appendTimestampNtz(long microsSinceEpoch) {
+    checkCapacity(1 + 8);
+    writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_NTZ);
+    writeLong(writeBuffer, writePos, microsSinceEpoch, 8);
+    writePos += 8;
+  }
+
+  /**
+   * Appends a float value to the variant builder as a header byte followed by the 4 bytes of
+   * `Float.floatToIntBits(f)`.
+   * @param f the float value to append
+   */
+  public void appendFloat(float f) {
+    checkCapacity(1 + 4);
+    writeBuffer[writePos++] = primitiveHeader(FLOAT);
+    // Write exactly the 4 payload bytes that were reserved above and that `writePos` advances
+    // by. Requesting more bytes from `writeLong` would touch memory past the reserved region
+    // and can run off the end of `writeBuffer`.
+    writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 4);
+    writePos += 4;
+  }
+
+  /**
+   * Appends a binary value to the variant builder as a header byte, a `U32_SIZE`-byte length and
+   * then the raw bytes.
+   * @param binary the bytes to append
+   */
+  public void appendBinary(byte[] binary) {
+    checkCapacity(1 + U32_SIZE + binary.length);
+    writeBuffer[writePos++] = primitiveHeader(BINARY);
+    writeLong(writeBuffer, writePos, binary.length, U32_SIZE);
+    writePos += U32_SIZE;
+    System.arraycopy(binary, 0, writeBuffer, writePos, binary.length);
+    writePos += binary.length;
+  }
+
+  /**
+   * Adds a key to the Variant dictionary. If the key already exists, the dictionary is unmodified.
+ * @param key the key to add + * @return the dictionary id of the key (the existing id if the key was already present) + */ + public int addKey(String key) { + int id; + if (dictionary.containsKey(key)) { + id = dictionary.get(key); + } else { + id = dictionaryKeys.size(); + dictionary.put(key, id); + dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8)); + } + return id; + } + + /** + * @return the current write position of the variant builder + */ + public int getWritePos() { + return writePos; + } + + // Finish writing a variant object after all of its fields have already been written. The process + // is as follows: + // 1. The caller calls `getWritePos` before writing any fields to obtain the `start` parameter. + // 2. The caller appends all the object fields to the builder. In the meantime, it should maintain + // the `fields` parameter. Before appending each field, it should append an entry to `fields` to + // record the offset of the field. The offset is computed as `getWritePos() - start`. + // 3. The caller calls `finishWritingObject` to finish writing a variant object. + // + // This function is responsible for sorting the fields by key. If there are duplicate field keys: + // - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + // appended one) is kept. + // - otherwise, throw an exception. + /** + * Finish writing a Variant object after all of its fields have already been written. The process + * is as follows: + * 1. The caller calls `getWritePos()` before writing any fields to obtain the `start` parameter. + * 2. The caller appends all the object fields to the builder. In the meantime, it should maintain + * the `fields` parameter. Before appending each field, it should append an entry to `fields` to + * record the offset of the field. The offset is computed as `getWritePos() - start`. + * 3. The caller calls `finishWritingObject` to finish writing the Variant object. + * + * This method will sort the fields by key.
If there are duplicate field keys: + * - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + * appended one) is kept. + * - otherwise, throw an exception. + * @param start the start position of the object in the write buffer + * @param fields the list of `FieldEntry` in the object + * @throws VariantDuplicateKeyException if there are duplicate keys and `allowDuplicateKeys` is + * false + */ + public void finishWritingObject(int start, ArrayList fields) { + int size = fields.size(); + Collections.sort(fields); + int maxId = size == 0 ? 0 : fields.get(0).id; + if (allowDuplicateKeys) { + int distinctPos = 0; + // Maintain a list of distinct keys in-place. + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).id == fields.get(i - 1).id) { + // Found a duplicate key. Keep the field with the greater offset. + if (fields.get(distinctPos).offset < fields.get(i).offset) { + fields.set(distinctPos, fields.get(distinctPos).withNewOffset(fields.get(i).offset)); + } + } else { + // Found a distinct key. Add the field to the list. + ++distinctPos; + fields.set(distinctPos, fields.get(i)); + } + } + if (distinctPos + 1 < fields.size()) { + size = distinctPos + 1; + // Resize `fields` to `size`. + fields.subList(size, fields.size()).clear(); + // Sort the fields by offsets so that we can move the value data of each field to the new + // offset without overwriting the fields after it. 
+ fields.sort(Comparator.comparingInt(f -> f.offset)); + int currentOffset = 0; + for (int i = 0; i < size; ++i) { + int oldOffset = fields.get(i).offset; + int fieldSize = VariantUtil.valueSize(writeBuffer, start + oldOffset); + System.arraycopy(writeBuffer, start + oldOffset, + writeBuffer, start + currentOffset, fieldSize); + fields.set(i, fields.get(i).withNewOffset(currentOffset)); + currentOffset += fieldSize; + } + writePos = start + currentOffset; + // Change back to the sort order by field keys, required by the Variant specification. + Collections.sort(fields); + } + } else { + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + String key = fields.get(i).key; + if (key.equals(fields.get(i - 1).key)) { + throw new VariantDuplicateKeyException(key); + } + } + } + int dataSize = writePos - start; + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int idSize = getMinIntegerSize(maxId); + int offsetSize = getMinIntegerSize(dataSize); + // The space for header byte, object size, id list, and offset list. + int headerSize = 1 + sizeBytes + size * idSize + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the object header section. + System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); + writePos += headerSize; + writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize); + writeLong(writeBuffer, start + 1, size, sizeBytes); + int idStart = start + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + for (int i = 0; i < size; ++i) { + writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize); + writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize); + } + writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + } + + /** + * Finish writing a Variant array after all of its elements have already been written. 
The process + * is similar to that of `finishWritingObject`. + * @param start the start position of the array in the write buffer + * @param offsets the list of offsets of the array elements + */ + public void finishWritingArray(int start, ArrayList offsets) { + int dataSize = writePos - start; + int size = offsets.size(); + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int offsetSize = getMinIntegerSize(dataSize); + // The space for header byte, object size, and offset list. + int headerSize = 1 + sizeBytes + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the header section. + System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); + writePos += headerSize; + writeBuffer[start] = arrayHeader(largeSize, offsetSize); + writeLong(writeBuffer, start + 1, size, sizeBytes); + int offsetStart = start + 1 + sizeBytes; + for (int i = 0; i < size; ++i) { + writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize); + } + writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + } + + /** + * Appends a Variant value to the Variant builder. The input Variant keys must be inserted into + * the builder dictionary and rebuilt with new field ids. For scalar values in the input + * Variant, we can directly copy the binary slice. 
+ * @param v the Variant value to append + */ + public void appendVariant(Variant v) { + appendVariantImpl(v.value, v.metadata, v.pos); + } + + private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + switch (basicType) { + case OBJECT: + handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + ArrayList fields = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + String key = getMetadataKey(metadata, id); + int newId = addKey(key); + fields.add(new FieldEntry(key, newId, writePos - start)); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingObject(start, fields); + return null; + }); + break; + case ARRAY: + handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + ArrayList offsets = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + offsets.add(writePos - start); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingArray(start, offsets); + return null; + }); + break; + default: + shallowAppendVariantImpl(value, pos); + break; + } + } + + private void shallowAppendVariantImpl(byte[] value, int pos) { + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + checkCapacity(size); + System.arraycopy(value, pos, writeBuffer, writePos, size); + writePos += size; + } + + private void checkCapacity(int additionalBytes) { + int requiredBytes = writePos + additionalBytes; + if (requiredBytes > writeBuffer.length) { + // Allocate a new buffer with a capacity of the next power of 2 of `requiredBytes`. 
+ int newCapacity = Integer.highestOneBit(requiredBytes); + newCapacity = newCapacity < requiredBytes ? newCapacity * 2 : newCapacity; + if (newCapacity > sizeLimitBytes) { + throw new VariantSizeLimitException(); + } + byte[] newValue = new byte[newCapacity]; + System.arraycopy(writeBuffer, 0, newValue, 0, writePos); + writeBuffer = newValue; + } + } + + // Temporarily store the information of a field. We need to collect all fields in an JSON object, + // sort them by their keys, and build the variant object in sorted order. + + /** + * Class to store the information of a Variant object field. We need to collect all fields of + * an object, sort them by their keys, and build the Variant object in sorted order. + */ + public static final class FieldEntry implements Comparable { + final String key; + final int id; + final int offset; + + public FieldEntry(String key, int id, int offset) { + this.key = key; + this.id = id; + this.offset = offset; + } + + FieldEntry withNewOffset(int newOffset) { + return new FieldEntry(key, id, newOffset); + } + + @Override + public int compareTo(FieldEntry other) { + return key.compareTo(other.key); + } + } + + private void buildFromJsonParser(JsonParser parser) throws IOException { + JsonToken token = parser.currentToken(); + if (token == null) { + throw new JsonParseException(parser, "Unexpected null token"); + } + switch (token) { + case START_OBJECT: { + ArrayList fields = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_OBJECT) { + String key = parser.currentName(); + parser.nextToken(); + int id = addKey(key); + fields.add(new FieldEntry(key, id, writePos - start)); + buildFromJsonParser(parser); + } + finishWritingObject(start, fields); + break; + } + case START_ARRAY: { + ArrayList offsets = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_ARRAY) { + offsets.add(writePos - start); + buildFromJsonParser(parser); + } + finishWritingArray(start, 
offsets); + break; + } + case VALUE_STRING: + appendString(parser.getText()); + break; + case VALUE_NUMBER_INT: + try { + appendLong(parser.getLongValue()); + } catch (InputCoercionException ignored) { + // If the value doesn't fit any integer type, parse it as decimal or floating instead. + parseAndAppendFloatingPoint(parser); + } + break; + case VALUE_NUMBER_FLOAT: + parseAndAppendFloatingPoint(parser); + break; + case VALUE_TRUE: + appendBoolean(true); + break; + case VALUE_FALSE: + appendBoolean(false); + break; + case VALUE_NULL: + appendNull(); + break; + default: + throw new JsonParseException(parser, "Unexpected token " + token); + } + } + + /** + * Returns the size (number of bytes) of the smallest unsigned integer type that can store + * `value`. It must be within `[0, U24_MAX]`. + * @param value the value to get the size for + * @return the size (number of bytes) of the smallest unsigned integer type that can store `value` + */ + private int getMinIntegerSize(int value) { + assert value >= 0 && value <= U24_MAX; + if (value <= U8_MAX) return 1; + if (value <= U16_MAX) return 2; + return U24_SIZE; + } + + /** + * Parse a JSON number as a floating point value. If the number can be parsed as a decimal, it + * will be appended as a decimal value. Otherwise, it will be appended as a double value. + * @param parser the JSON parser to use + */ + private void parseAndAppendFloatingPoint(JsonParser parser) throws IOException { + if (!tryParseDecimal(parser.getText())) { + appendDouble(parser.getDoubleValue()); + } + } + + /** + * Try to parse a JSON number as a decimal. The input must only use the decimal format + * (an integer value with an optional '.' in it) and must not use scientific notation. It also + * must fit into the precision limitation of decimal types. 
+ * @param input the input string to parse as decimal + * @return whether the parsing succeeds + */ + private boolean tryParseDecimal(String input) { + for (int i = 0; i < input.length(); ++i) { + char ch = input.charAt(i); + if (ch != '-' && ch != '.' && !(ch >= '0' && ch <= '9')) { + return false; + } + } + BigDecimal d = new BigDecimal(input); + if (d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION) { + appendDecimal(d); + return true; + } + return false; + } + + /** The buffer for building the Variant value. The first `writePos` bytes have been written. */ + private byte[] writeBuffer = new byte[128]; + private int writePos = 0; + /** The dictionary for mapping keys to monotonically increasing ids. */ + private final HashMap dictionary = new HashMap<>(); + /** The keys in the dictionary, in id order. */ + private final ArrayList dictionaryKeys = new ArrayList<>(); + + private final boolean allowDuplicateKeys; + private final int sizeLimitBytes; +} diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java new file mode 100644 index 0000000000..12e94416c4 --- /dev/null +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Signals that a Variant object being built contains the same key more than once.
 */
public class VariantDuplicateKeyException extends RuntimeException {
  /** The offending key; also available through {@link #getKey()}. */
  public final String key;

  /**
   * @param key the key that was duplicated
   */
  public VariantDuplicateKeyException(String key) {
    super(describe(key));
    this.key = key;
  }

  /**
   * @return the key that was duplicated
   */
  public String getKey() {
    return this.key;
  }

  /** Builds the exception message for the given duplicated key. */
  private static String describe(String duplicatedKey) {
    return "Failed to build Variant because of duplicate object key: " + duplicatedKey;
  }
}
/**
 * An exception indicating that the metadata or data size of the Variant exceeds the
 * configured size limit.
 */
public class VariantSizeLimitException extends RuntimeException {
  /**
   * Creates the exception with a fixed descriptive message, so that a failure is understandable
   * from a log line or stack trace alone.
   */
  public VariantSizeLimitException() {
    super("Failed to build Variant because its size exceeds the configured size limit");
  }
}
+ */ +package org.apache.parquet.variant; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.Arrays; + +/** + * This class defines constants related to the Variant format and provides functions for + * manipulating Variant binaries. + + * A Variant is made up of 2 binaries: value and metadata. A Variant value consists of a one-byte + * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits + * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in + * the below constants for all possible basic type and type info values. + + * The Variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). + * Its binary format is: + * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. + * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the + * dictionary. + * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the + * starting position of string i, counting starting from the address of `offsets[0]`. Strings + * must be stored contiguously, so we don’t need to store the string size, instead, we compute it + * with `offset[i + 1] - offset[i]`. + * - UTF-8 string data. + */ +public class VariantUtil { + public static final int BASIC_TYPE_BITS = 2; + public static final int BASIC_TYPE_MASK = 0x3; + public static final int TYPE_INFO_MASK = 0x3F; + /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */ + public static final int MAX_SHORT_STR_SIZE = 0x3F; + + // The basic types + + /** + * Primitive value. + * The type info value must be one of the values in the "Primitive" section below. + */ + public static final int PRIMITIVE = 0; + /** + * Short string value. + * The type info value is the string size, which must be in `[0, MAX_SHORT_STR_SIZE]`. + * The string content bytes directly follow the header byte. 
+ */ + public static final int SHORT_STR = 1; + /** + * Object value. + * The content contains a size, a list of field ids, a list of field offsets, and + * the actual field values. The list of field ids has `size` ids, while the list of field offsets + * has `size + 1` offsets, where the last offset represents the total size of the field values + * data. The list of fields ids must be sorted by the field name in alphabetical order. + * Duplicate field names within one object are not allowed. + * 5 bits in the type info are used to specify the integer type of the object header. It is + * 0_b4_b3b2_b1b0 (MSB is 0), where: + * - b4: the integer type of size. When it is 0/1, `size` is a little-endian 1/4-byte + * unsigned integer. + * - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list contains + * 1/2/3-byte little-endian unsigned integers. + * - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the offset list contains + * 1/2/3-byte little-endian unsigned integers. + */ + public static final int OBJECT = 2; + /** + * Array value. + * The content contains a size, a list of field offsets, and the actual element values. + * It is similar to an object without the id list. The length of the offset list + * is `size + 1`, where the last offset represent the total size of the element data. + * Its type info is: 000_b2_b1b0: + * - b2: the type of size. + * - b1b0: the integer type of offset. + */ + public static final int ARRAY = 3; + + // The primitive types + + /** JSON Null value. Empty content. */ + public static final int NULL = 0; + /** True value. Empty content. */ + public static final int TRUE = 1; + /** False value. Empty content. */ + public static final int FALSE = 2; + /** 1-byte little-endian signed integer. */ + public static final int INT1 = 3; + /** 2-byte little-endian signed integer. */ + public static final int INT2 = 4; + /** 4-byte little-endian signed integer. 
*/ + public static final int INT4 = 5; + /** 4-byte little-endian signed integer. */ + public static final int INT8 = 6; + /** 8-byte IEEE double. */ + public static final int DOUBLE = 7; + /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. */ + public static final int DECIMAL4 = 8; + /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. */ + public static final int DECIMAL8 = 9; + /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. */ + public static final int DECIMAL16 = 10; + /** + * Date value. Content is 4-byte little-endian signed integer that represents the + * number of days from the Unix epoch. + */ + public static final int DATE = 11; + /** + * Timestamp value. Content is 8-byte little-endian signed integer that represents the number of + * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in + * their local time zones and may be displayed differently depending on the execution environment. + */ + public static final int TIMESTAMP = 12; + /** + * Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted + * as if the local time zone is UTC. + */ + public static final int TIMESTAMP_NTZ = 13; + /** 4-byte IEEE float. */ + public static final int FLOAT = 14; + /** + * Binary value. The content is (4-byte little-endian unsigned integer representing the binary + * size) + (size bytes of binary content). + */ + public static final int BINARY = 15; + /** + * Long string value. The content is (4-byte little-endian unsigned integer representing the + * string size) + (size bytes of string content). + */ + public static final int LONG_STR = 16; + + // The metadata version. + public static final byte VERSION = 1; + // The lower 4 bits of the first metadata byte contain the version. + public static final byte VERSION_MASK = 0x0F; + + // Constants for various unsigned integer sizes. 
+ public static final int U8_MAX = 0xFF; + public static final int U16_MAX = 0xFFFF; + public static final int U24_MAX = 0xFFFFFF; + public static final int U24_SIZE = 3; + public static final int U32_SIZE = 4; + + // Max decimal precision for each decimal type. + public static final int MAX_DECIMAL4_PRECISION = 9; + public static final int MAX_DECIMAL8_PRECISION = 18; + public static final int MAX_DECIMAL16_PRECISION = 38; + + // Default size limit for both variant value and variant metadata. + public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1; + + /** + * Write the least significant `numBytes` bytes in `value` into `bytes[pos, pos + numBytes)` in + * little endian. + * @param bytes The byte array to write into + * @param pos The starting index of the byte array to write into + * @param value The value to write + * @param numBytes The number of bytes to write + */ + public static void writeLong(byte[] bytes, int pos, long value, int numBytes) { + for (int i = 0; i < numBytes; ++i) { + bytes[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + } + + public static byte primitiveHeader(int type) { + return (byte) (type << 2 | PRIMITIVE); + } + + public static byte shortStrHeader(int size) { + return (byte) (size << 2 | SHORT_STR); + } + + public static byte objectHeader(boolean largeSize, int idSize, int offsetSize) { + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) | + ((idSize - 1) << (BASIC_TYPE_BITS + 2)) | + ((offsetSize - 1) << BASIC_TYPE_BITS) | OBJECT); + } + + public static byte arrayHeader(boolean largeSize, int offsetSize) { + return (byte) (((largeSize ? 
1 : 0) << (BASIC_TYPE_BITS + 2)) | + ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); + } + + public static MalformedVariantException malformedVariant() { + return new MalformedVariantException(); + } + + public static UnknownVariantTypeException unknownPrimitiveTypeInVariant(int id) { + return new UnknownVariantTypeException(id); + } + + /** + * Check the validity of an array index `pos`. + * @param pos The index to check + * @param length The length of the array + * @throws MalformedVariantException if the index is out of bound + */ + public static void checkIndex(int pos, int length) { + if (pos < 0 || pos >= length) throw malformedVariant(); + } + + /** + * Reads a little-endian signed long value from `bytes[pos, pos + numBytes)`. + * @param bytes The byte array to read from + * @param pos The starting index of the byte array to read from + * @param numBytes The number of bytes to read + * @return The long value + */ + static long readLong(byte[] bytes, int pos, int numBytes) { + checkIndex(pos, bytes.length); + checkIndex(pos + numBytes - 1, bytes.length); + long result = 0; + // All bytes except the most significant byte should be unsigned-extended and shifted + // (so we need & 0xFF`). The most significant byte should be sign-extended and is handled + // after the loop. + for (int i = 0; i < numBytes - 1; ++i) { + long unsignedByteValue = bytes[pos + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + long signedByteValue = bytes[pos + numBytes - 1]; + result |= signedByteValue << (8 * (numBytes - 1)); + return result; + } + + /** + * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit + * into a non-negative int (`[0, Integer.MAX_VALUE]`). + */ + static int readUnsigned(byte[] bytes, int pos, int numBytes) { + checkIndex(pos, bytes.length); + checkIndex(pos + numBytes - 1, bytes.length); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsigned-extended. 
+ for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes[pos + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) throw malformedVariant(); + return result; + } + + /** + * The value type of Variant value. It is determined by the header byte but not a 1:1 mapping + * (for example, INT1/2/4/8 all maps to `Type.LONG`). + */ + public enum Type { + OBJECT, + ARRAY, + NULL, + BOOLEAN, + LONG, + STRING, + DOUBLE, + DECIMAL, + DATE, + TIMESTAMP, + TIMESTAMP_NTZ, + FLOAT, + BINARY, + } + + public static int getTypeInfo(byte[] value, int pos) { + checkIndex(pos, value.length); + return (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + } + + /** + * Returns the value type of Variant value `value[pos...]`. It is only legal to call `get*` if + * `getType` returns the corresponding type. For example, it is only legal to call + * `getLong` if this method returns `Type.Long`. + * @param value The Variant value to get the type from + * @param pos The starting index of the Variant value + * @return The type of the Variant value + */ + public static Type getType(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + switch (basicType) { + case SHORT_STR: + return Type.STRING; + case OBJECT: + return Type.OBJECT; + case ARRAY: + return Type.ARRAY; + default: + switch (typeInfo) { + case NULL: + return Type.NULL; + case TRUE: + case FALSE: + return Type.BOOLEAN; + case INT1: + case INT2: + case INT4: + case INT8: + return Type.LONG; + case DOUBLE: + return Type.DOUBLE; + case DECIMAL4: + case DECIMAL8: + case DECIMAL16: + return Type.DECIMAL; + case DATE: + return Type.DATE; + case TIMESTAMP: + return Type.TIMESTAMP; + case TIMESTAMP_NTZ: + return Type.TIMESTAMP_NTZ; + case FLOAT: + return Type.FLOAT; + case BINARY: + return Type.BINARY; + case LONG_STR: + return Type.STRING; + default: + throw 
unknownPrimitiveTypeInVariant(typeInfo); + } + } + } + + /** + * Computes the actual size (in bytes) of the Variant value at `value[pos...]`. + * `value.length - pos` is an upper bound of the size, but the actual size may be smaller. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @return The actual size of the Variant value + * @throws MalformedVariantException if the Variant is malformed + */ + public static int valueSize(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + switch (basicType) { + case SHORT_STR: + return 1 + typeInfo; + case OBJECT: + return handleObject(value, pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> + dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + case ARRAY: + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> + dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize)); + default: + switch (typeInfo) { + case NULL: + case TRUE: + case FALSE: + return 1; + case INT1: + return 2; + case INT2: + return 3; + case INT4: + case DATE: + case FLOAT: + return 5; + case INT8: + case DOUBLE: + case TIMESTAMP: + case TIMESTAMP_NTZ: + return 9; + case DECIMAL4: + return 6; + case DECIMAL8: + return 10; + case DECIMAL16: + return 18; + case BINARY: + case LONG_STR: + return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE); + default: + throw unknownPrimitiveTypeInVariant(typeInfo); + } + } + } + + private static IllegalStateException unexpectedType(Type type) { + return new IllegalStateException("Expect type to be " + type); + } + + public static boolean getBoolean(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || 
(typeInfo != TRUE && typeInfo != FALSE)) { + throw unexpectedType(Type.BOOLEAN); + } + return typeInfo == TRUE; + } + + /** + * Returns a long value from Variant value `value[pos...]`. + * It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP, + * TIMESTAMP_NTZ. + * If the type is `DATE`, the return value is guaranteed to fit into an int and + * represents the number of days from the Unix epoch. + * If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of + * microseconds from the Unix epoch. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @return The long value + */ + public static long getLong(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; + if (basicType != PRIMITIVE) throw new IllegalStateException(exceptionMessage); + switch (typeInfo) { + case INT1: + return readLong(value, pos + 1, 1); + case INT2: + return readLong(value, pos + 1, 2); + case INT4: + case DATE: + return readLong(value, pos + 1, 4); + case INT8: + case TIMESTAMP: + case TIMESTAMP_NTZ: + return readLong(value, pos + 1, 8); + default: + throw new IllegalStateException(exceptionMessage); + } + } + + public static double getDouble(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != DOUBLE) throw unexpectedType(Type.DOUBLE); + return Double.longBitsToDouble(readLong(value, pos + 1, 8)); + } + + /** + * Checks whether the precision and scale of the decimal are within the limit. 
+ * @param d The decimal value to check + * @param maxPrecision The maximum precision allowed + * @throws MalformedVariantException if the decimal is malformed + */ + private static void checkDecimal(BigDecimal d, int maxPrecision) { + if (d.precision() > maxPrecision || d.scale() > maxPrecision) { + throw malformedVariant(); + } + } + + public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL); + // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be + // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. + int scale = value[pos + 1] & 0xFF; + BigDecimal result; + switch (typeInfo) { + case DECIMAL4: + result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale); + checkDecimal(result, MAX_DECIMAL4_PRECISION); + break; + case DECIMAL8: + result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale); + checkDecimal(result, MAX_DECIMAL8_PRECISION); + break; + case DECIMAL16: + checkIndex(pos + 17, value.length); + byte[] bytes = new byte[16]; + // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian + // representation. 
+ for (int i = 0; i < 16; ++i) { + bytes[i] = value[pos + 17 - i]; + } + result = new BigDecimal(new BigInteger(bytes), scale); + checkDecimal(result, MAX_DECIMAL16_PRECISION); + break; + default: + throw unexpectedType(Type.DECIMAL); + } + return result; + } + + public static BigDecimal getDecimal(byte[] value, int pos) { + return getDecimalWithOriginalScale(value, pos); + } + + public static float getFloat(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != FLOAT) throw unexpectedType(Type.FLOAT); + return Float.intBitsToFloat((int) readLong(value, pos + 1, 4)); + } + + public static byte[] getBinary(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != BINARY) throw unexpectedType(Type.BINARY); + int start = pos + 1 + U32_SIZE; + int length = readUnsigned(value, pos + 1, U32_SIZE); + checkIndex(start + length - 1, value.length); + return Arrays.copyOfRange(value, start, start + length); + } + + /** + * Returns a string value from Variant value `value[pos...]`. + * It is only legal to call it if the value is a short string (basic type SHORT_STR) or a + * long string (basic type PRIMITIVE with type info LONG_STR). + * The string bytes are decoded as UTF-8, independent of the platform default charset. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @return The string value + */ + public static String getString(byte[] value, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) { + int start; + int length; + if (basicType == SHORT_STR) { + start = pos + 1; + length = typeInfo; + } else { + start = pos + 1 + U32_SIZE; + length = readUnsigned(value, pos + 1, U32_SIZE); + } + checkIndex(start + length - 1, value.length); + // The 3-argument String constructor would decode with the platform default charset. + return new String(value, start, length, java.nio.charset.StandardCharsets.UTF_8); + } + throw unexpectedType(Type.STRING); + } + + /** + * An interface for the Variant object handler.
+ * @param <T> The return type of the handler + */ + public interface ObjectHandler<T> { + /** + * @param size Number of object fields. + * @param idSize The integer size of the field id list. + * @param offsetSize The integer size of the offset list. + * @param idStart The starting index of the field id list in the variant value array. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of field data in the variant value array. + */ + T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart); + } + + /** + * A helper function to access a Variant object, at `value[pos...]`. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @param handler The handler to process the object + * @return The result of the handler + * @param <T> The return type of the handler + */ + public static <T> T handleObject(byte[] value, int pos, ObjectHandler<T> handler) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != OBJECT) throw unexpectedType(Type.OBJECT); + // Refer to the comment of the `OBJECT` constant for the details of the object header encoding. + // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts + // b4 to determine whether the object uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 4) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, pos + 1, sizeBytes); + // Extracts b3b2 to determine the integer size of the field id list. + int idSize = ((typeInfo >> 2) & 0x3) + 1; + // Extracts b1b0 to determine the integer size of the offset list.
+ int offsetSize = (typeInfo & 0x3) + 1; + int idStart = pos + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart); + } + + /** + * An interface for the Variant array handler. + * @param <T> The return type of the handler + */ + public interface ArrayHandler<T> { + /** + * @param size Number of array elements. + * @param offsetSize The integer size of the offset list. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of element data in the variant value array. + */ + T apply(int size, int offsetSize, int offsetStart, int dataStart); + } + + /** + * A helper function to access a Variant array, at `value[pos...]`. + * @param value The Variant value + * @param pos The starting index of the Variant value + * @param handler The handler to process the array + * @return The result of the handler + * @param <T> The return type of the handler + */ + public static <T> T handleArray(byte[] value, int pos, ArrayHandler<T> handler) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != ARRAY) throw unexpectedType(Type.ARRAY); + // Refer to the comment of the `ARRAY` constant for the details of the array header encoding. + // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts + // b2 to determine whether the array uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 2) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, pos + 1, sizeBytes); + // Extracts b1b0 to determine the integer size of the offset list.
+ int offsetSize = (typeInfo & 0x3) + 1; + int offsetStart = pos + 1 + sizeBytes; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, offsetSize, offsetStart, dataStart); + } + + /** + * Returns a key at `id` in the Variant metadata. The key bytes are decoded as UTF-8. + * @param metadata The Variant metadata + * @param id The key id + * @return The key + * @throws MalformedVariantException if the Variant is malformed or if the id is out of bounds + */ + public static String getMetadataKey(byte[] metadata, int id) { + checkIndex(0, metadata.length); + // Extracts the highest 2 bits in the metadata header to determine the integer size of the + // offset list. + int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; + int dictSize = readUnsigned(metadata, 1, offsetSize); + // A negative id must be rejected too: id == -1 would read the `dictSize` field as an offset. + if (id < 0 || id >= dictSize) throw malformedVariant(); + // There is a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets + // before the string data. + int stringStart = 1 + (dictSize + 2) * offsetSize; + int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); + int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); + if (offset > nextOffset) throw malformedVariant(); + checkIndex(stringStart + nextOffset - 1, metadata.length); + // The 3-argument String constructor would decode with the platform default charset. + return new String(metadata, stringStart + offset, nextOffset - offset, java.nio.charset.StandardCharsets.UTF_8); + } +} diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java new file mode 100644 index 0000000000..ea661df4dd --- /dev/null +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.variant; + +import java.io.IOException; +import java.math.BigDecimal; +import java.security.SecureRandom; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.Arrays; +import java.util.Base64; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.stream.IntStream; +import com.fasterxml.jackson.core.*; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class TestVariantEncoding { + private static final Logger LOG = LoggerFactory.getLogger(TestVariantEncoding.class); + private static final String RANDOM_CHARS = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + private static final List SAMPLE_JSON_VALUES = Arrays.asList( + "null", + "true", + "false", + "12", + "-9876543210", + "4.5678E123", + "8.765E-2", + "\"string value\"", + "-9876.543", + "234.456789", + "{\"a\": 1, \"b\": {\"e\": -4, \"f\": 5.5}, \"c\": true}", + "[1, -2, 4.5, -6.7, \"str\", true]" + ); + + /** Random number generator for generating random strings */ + private static SecureRandom random = new SecureRandom(); + /** Object mapper for comparing json values */ + private final ObjectMapper mapper = new ObjectMapper(); + + private 
void checkJson(String jsonValue) { + try { + StreamReadConstraints.overrideDefaultStreamReadConstraints( + StreamReadConstraints.builder().maxNestingDepth(100000).build()); + Variant v = VariantBuilder.parseJson(jsonValue); + Assert.assertEquals(mapper.readTree(jsonValue), + mapper.readTree(v.toJson(ZoneId.systemDefault()))); + } catch (IOException e) { + Assert.fail("Failed to parse json: " + jsonValue + " " + e); + } + } + + private void checkType(Variant v, int expectedBasicType, int expectedTypeInfo) { + Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK); + Assert.assertEquals(expectedTypeInfo, v.getTypeInfo()); + } + + private long microsSinceEpoch(Instant instant) { + return TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000; + } + + private String randomString(int len) { + StringBuilder sb = new StringBuilder(len); + for (int i = 0; i < len; i++) { + sb.append(RANDOM_CHARS.charAt(random.nextInt(RANDOM_CHARS.length()))); + } + return sb.toString(); + } + + @Test + public void testNullJson() { + checkJson("null"); + } + + @Test + public void testBooleanJson() { + Arrays.asList("true", "false").forEach(this::checkJson); + } + + @Test + public void testIntegerJson() { + Arrays.asList( + "0", + Byte.toString(Byte.MIN_VALUE), Byte.toString(Byte.MAX_VALUE), + Short.toString(Short.MIN_VALUE), Short.toString(Short.MAX_VALUE), + Integer.toString(Integer.MIN_VALUE), Integer.toString(Integer.MAX_VALUE), + Long.toString(Long.MIN_VALUE), Long.toString(Long.MAX_VALUE) + ).forEach(this::checkJson); + } + + @Test + public void testFloatJson() { + Arrays.asList( + Float.toString(Float.MIN_VALUE), Float.toString(Float.MAX_VALUE), + Double.toString(Double.MIN_VALUE), Double.toString(Double.MAX_VALUE) + ).forEach(this::checkJson); + } + + @Test + public void testStringJson() { + Arrays.asList( + "\"short string\"", + "\"long string: " + new String(new char[1000]).replace("\0", "x") + "\"" + ).forEach(this::checkJson); 
+ } + + @Test + public void testDecimalJson() { + Arrays.asList( + "12.34", "-43.21", + "10.2147483647", "-1021474836.47", + "109223372036854775.807", "-109.223372036854775807" + ).forEach(this::checkJson); + } + + @Test + public void testNullBuilder() { + VariantBuilder vb = new VariantBuilder(false); + vb.appendNull(); + checkType(vb.result(), VariantUtil.NULL, 0); + } + + @Test + public void testBooleanBuilder() { + Arrays.asList(true, false).forEach( b -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendBoolean(b); + checkType(vb2.result(), VariantUtil.PRIMITIVE, b ? VariantUtil.TRUE : VariantUtil.FALSE); + }); + } + + @Test + public void testIntegerBuilder() { + Arrays.asList( + 0L, + (long)Byte.MIN_VALUE, (long)Byte.MAX_VALUE, + (long)Short.MIN_VALUE, (long)Short.MAX_VALUE, + (long)Integer.MIN_VALUE, (long)Integer.MAX_VALUE, + Long.MIN_VALUE, Long.MAX_VALUE + ).forEach( l -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendLong(l); + Variant v = vb2.result(); + if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1); + } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2); + } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8); + } + Assert.assertEquals((long)l, v.getLong()); + }); + } + + @Test + public void testFloatBuilder() { + Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach( f -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendFloat(f); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.FLOAT); + Assert.assertEquals(f, v.getFloat(), 0.000001); + }); + } + + @Test + public void testDoubleBuilder() { + Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach( d -> { + VariantBuilder vb2 = new VariantBuilder(false); + 
vb2.appendDouble(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DOUBLE); + Assert.assertEquals(d, v.getDouble(), 0.000001); + }); + } + + @Test + public void testStringBuilder() { + IntStream.range(VariantUtil.MAX_SHORT_STR_SIZE - 3, + VariantUtil.MAX_SHORT_STR_SIZE + 3).forEach( len -> { + VariantBuilder vb2 = new VariantBuilder(false); + String s = randomString(len); + vb2.appendString(s); + Variant v = vb2.result(); + if (len <= VariantUtil.MAX_SHORT_STR_SIZE) { + checkType(v, VariantUtil.SHORT_STR, len); + } else { + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR); + } + Assert.assertEquals(s, v.getString()); + }); + } + + @Test + public void testDecimalBuilder() { + // decimal4 + Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach( d -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendDecimal(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL4); + Assert.assertEquals(d, v.getDecimal()); + }); + + // decimal8 + Arrays.asList( + new BigDecimal("10.2147483647"), + new BigDecimal("-1021474836.47") + ).forEach( d -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendDecimal(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8); + Assert.assertEquals(d, v.getDecimal()); + }); + + // decimal16 + Arrays.asList( + new BigDecimal("109223372036854775.807"), + new BigDecimal("-109.223372036854775807") + ).forEach( d -> { + VariantBuilder vb2 = new VariantBuilder(false); + vb2.appendDecimal(d); + Variant v = vb2.result(); + checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16); + Assert.assertEquals(d, v.getDecimal()); + }); + } + + @Test + public void testDate() { + VariantBuilder vb = new VariantBuilder(false); + int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay()); + vb.appendDate(days); + Assert.assertEquals("\"2024-12-16\"", vb.result().toJson(ZoneId.systemDefault())); + 
Assert.assertEquals(days, vb.result().getLong()); + } + + @Test + public void testTimestamp() { + VariantBuilder vb = new VariantBuilder(false); + long micros = microsSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456-08:00")); + vb.appendTimestamp(micros); + Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"", + vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"", + vb.result().toJson(ZoneId.of("+01:00"))); + Assert.assertEquals(micros, vb.result().getLong()); + } + + @Test + public void testTimestampNtz() { + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; + VariantBuilder vb = new VariantBuilder(false); + long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z"))); + vb.appendTimestampNtz(micros); + Assert.assertEquals("\"2024-01-01T23:00:00.000001\"", + vb.result().toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")), + vb.result().toJson(ZoneId.of("+02:00"))); + Assert.assertEquals(micros, vb.result().getLong()); + } + + @Test + public void testBinary() { + VariantBuilder vb = new VariantBuilder(false); + byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + vb.appendBinary(binary); + Assert.assertEquals("\"" + Base64.getEncoder().encodeToString(binary) + "\"", + vb.result().toJson(ZoneId.systemDefault())); + Assert.assertArrayEquals(binary, vb.result().getBinary()); + } + + @Test + public void testObject() { + // simple object + StringBuilder sb = new StringBuilder(); + sb.append("{"); + for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { + if (i > 0) sb.append(", "); + sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i)); + } + sb.append("}"); + checkJson(sb.toString()); + + // wide object + sb = new StringBuilder(); + sb.append("{"); + for (int i = 0; i < 50000; i++) { + if (i > 0) sb.append(", "); + sb.append("\"field" + i + "\": ") + .append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size())); + 
} + sb.append("}"); + checkJson(sb.toString()); + + // deep object + sb = new StringBuilder(); + // Jackson object mapper hit a stack overflow if json is too deep + for (int i = 0; i < 1000; i++) { + sb.append("{").append("\"field" + i + "\": "); + } + sb.append("{"); + for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { + if (i > 0) sb.append(", "); + sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i)); + } + sb.append("}"); + for (int i = 0; i < 1000; i++) { + sb.append("}"); + } + checkJson(sb.toString()); + } + + @Test + public void testArray() { + // simple array + StringBuilder sb = new StringBuilder(); + sb.append("["); + for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) { + if (i > 0) sb.append(", "); + sb.append(SAMPLE_JSON_VALUES.get(i)); + } + sb.append("]"); + checkJson(sb.toString()); + + // large array + sb = new StringBuilder(); + sb.append("["); + for (int i = 0; i < 50000; i++) { + if (i > 0) sb.append(", "); + sb.append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size())); + } + sb.append("]"); + checkJson(sb.toString()); + } + + @Test + public void testSizeLimit() { + // large metadata size + try { + VariantBuilder.parseJson( + "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}", + new VariantBuilder(false, 20)); + Assert.fail("Expected VariantSizeLimitException with large metadata"); + } catch (IOException e) { + Assert.fail("Expected VariantSizeLimitException with large metadata"); + } catch (VariantSizeLimitException e) { + // Expected + } + + // large data size + try { + StringBuilder sb = new StringBuilder(); + sb.append("["); + for (int i = 0; i < 100; i++) { + if (i > 0) sb.append(", "); + sb.append("{\"a\":1}"); + } + sb.append("]"); + VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 20)); + Assert.fail("Expected VariantSizeLimitException with large data"); + } catch (IOException e) { + Assert.fail("Expected VariantSizeLimitException with large data"); + } catch (VariantSizeLimitException 
e) { + // Expected + } + } + + @Test + public void testAllowDuplicateKeys() { + // disallow duplicate keys + try { + VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}"); + Assert.fail("Expected VariantDuplicateKeyException with duplicate keys"); + } catch (IOException e) { + Assert.fail("Expected VariantDuplicateKeyException with duplicate keys"); + } catch (VariantDuplicateKeyException e) { + // Expected + } + + // allow duplicate keys + try { + Variant v = VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}", + new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT)); + Assert.assertEquals(1, v.objectSize()); + Assert.assertEquals(VariantUtil.Type.LONG, v.getFieldByKey("a").getType()); + Assert.assertEquals(2, v.getFieldByKey("a").getLong()); + } catch (Exception e) { + Assert.fail("Unexpected exception: " + e); + } + } + + @Test + public void testTruncateTrailingZeroDecimal() { + for (String[] strings : Arrays.asList( + // decimal4 + // truncate all trailing zeros + new String[]{"1234.0000", "1234"}, + // truncate some trailing zeros + new String[]{"1234.5600", "1234.56"}, + // truncate no trailing zeros + new String[]{"1234.5678", "1234.5678"}, + // decimal8 + // truncate all trailing zeros + new String[]{"-10.0000000000", "-10"}, + // truncate some trailing zeros + new String[]{"-10.2147000000", "-10.2147"}, + // truncate no trailing zeros + new String[]{"-10.2147483647", "-10.2147483647"}, + // decimal16 + // truncate all trailing zeros + new String[]{"1092233720368547.00000", "1092233720368547"}, + // truncate some trailing zeros + new String[]{"1092233720368547.75800", "1092233720368547.758"}, + // truncate no trailing zeros + new String[]{"1092233720368547.75807", "1092233720368547.75807"})) { + VariantBuilder vb = new VariantBuilder(false); + BigDecimal d = new BigDecimal(strings[0]); + vb.appendDecimal(d); + Variant v = vb.result(); + Assert.assertEquals(strings[0], v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(strings[1], v.toJson(ZoneId.of("-08:00"), 
true)); + } + } + + @Test + public void testTruncateTrailingZeroTimestamp() { + // timestamp + for (String[] strings : Arrays.asList( + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"}, + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.123000-08:00", "2024-12-16T10:23:45.123-08:00"}, + // truncate no trailing zeros + new String[] {"2024-12-16T10:23:45.123456-08:00", "2024-12-16T10:23:45.123456-08:00"})) { + VariantBuilder vb = new VariantBuilder(false); + long micros = microsSinceEpoch(Instant.parse(strings[0])); + vb.appendTimestamp(micros); + Variant v = vb.result(); + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + } + + // timestampNTZ + DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; + for (String[] strings : Arrays.asList( + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.000000", "2024-12-16T10:23:45"}, + // truncate all trailing zeros + new String[] {"2024-12-16T10:23:45.123000", "2024-12-16T10:23:45.123"}, + // truncate no trailing zeros + new String[] {"2024-12-16T10:23:45.123456", "2024-12-16T10:23:45.123456"})) { + VariantBuilder vb = new VariantBuilder(false); + + long micros = microsSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0])))); + vb.appendTimestampNtz(micros); + Variant v = vb.result(); + Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); + Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); + Assert.assertEquals(micros, vb.result().getLong()); + } + } +} diff --git a/pom.xml b/pom.xml index 2496171867..5f49bf1764 100644 --- a/pom.xml +++ b/pom.xml @@ -165,6 +165,7 @@ parquet-protobuf parquet-thrift parquet-hadoop-bundle + parquet-variant From c5d19e652b22380407fe277ab9bc9c82a90564d3 Mon Sep 17 
00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 09:42:45 -0800 Subject: [PATCH 2/5] remove optional --- .../src/main/java/org/apache/parquet/variant/Variant.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index 4fcdb6b0e5..dfdb85b96e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -323,7 +323,6 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendValue(HOUR_OF_DAY, 2) .appendLiteral(':') .appendValue(MINUTE_OF_HOUR, 2) - .optionalStart() .appendLiteral(':') .appendValue(SECOND_OF_MINUTE, 2) .appendFraction(MICRO_OF_SECOND, 6, 6, true) @@ -343,7 +342,6 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendValue(HOUR_OF_DAY, 2) .appendLiteral(':') .appendValue(MINUTE_OF_HOUR, 2) - .optionalStart() .appendLiteral(':') .appendValue(SECOND_OF_MINUTE, 2) .optionalStart() From 0086b3476e5badbfea775a5bbdf55d488d6019b7 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 10:46:45 -0800 Subject: [PATCH 3/5] split test --- .../java/org/apache/parquet/variant/TestVariantEncoding.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java index ea661df4dd..0a8740e3c0 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java @@ -452,7 +452,6 @@ public void testTruncateTrailingZeroDecimal() { @Test public void testTruncateTrailingZeroTimestamp() { - // timestamp for (String[] strings : Arrays.asList( // truncate all trailing zeros new String[] 
{"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"}, @@ -467,8 +466,10 @@ public void testTruncateTrailingZeroTimestamp() { Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00"))); Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true)); } + } - // timestampNTZ + @Test + public void testTruncateTrailingZeroTimestampNtz() { DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME; for (String[] strings : Arrays.asList( // truncate all trailing zeros From 5af337ffe3214a66e7dee16b704eb0a507d719b2 Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 10:50:24 -0800 Subject: [PATCH 4/5] cleanup --- .../org/apache/parquet/variant/Variant.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index dfdb85b96e..d88c38f0df 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -320,11 +320,7 @@ private static void appendQuoted(StringBuilder sb, String str) { private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) .appendLiteral('T') - .appendValue(HOUR_OF_DAY, 2) - .appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 2) - .appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 2) + .appendPattern("HH:mm:ss") .appendFraction(MICRO_OF_SECOND, 6, 6, true) .toFormatter(Locale.US); @@ -339,13 +335,10 @@ private static void appendQuoted(StringBuilder sb, String str) { new DateTimeFormatterBuilder() .append(DateTimeFormatter.ISO_LOCAL_DATE) .appendLiteral('T') - .appendValue(HOUR_OF_DAY, 2) - .appendLiteral(':') - .appendValue(MINUTE_OF_HOUR, 2) - .appendLiteral(':') - .appendValue(SECOND_OF_MINUTE, 2) + .appendPattern("HH:mm:ss") 
.optionalStart() .appendFraction(MICRO_OF_SECOND, 0, 6, true) + .optionalEnd() .toFormatter(Locale.US); /** The format for a timestamp with time zone, truncating trailing microsecond zeros. */ @@ -354,8 +347,8 @@ private static void appendQuoted(StringBuilder sb, String str) { .appendOffset("+HH:MM", "+00:00") .toFormatter(Locale.US); - private static Instant microsToInstant(long timestamp) { - return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS); + private static Instant microsToInstant(long microsSinceEpoch) { + return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS); } private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, From 599773287c3ce96de9809497c9969c4dcf98948f Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Tue, 7 Jan 2025 13:47:07 -0800 Subject: [PATCH 5/5] cleanup comment --- .../src/main/java/org/apache/parquet/variant/Variant.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java index d88c38f0df..acb635119f 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java @@ -301,8 +301,6 @@ private static String escapeJson(String str) { } } - // A simplified and more performant version of `sb.append(escapeJson(str))`. It is used when we - // know `str` doesn't contain any special character that needs escaping. /** * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any * special characters that needs escaping. This is more performant than