diff --git a/parquet-variant/pom.xml b/parquet-variant/pom.xml
new file mode 100644
index 0000000000..6bfc2ff525
--- /dev/null
+++ b/parquet-variant/pom.xml
@@ -0,0 +1,88 @@
+
+
+
+ org.apache.parquet
+ parquet
+ ../pom.xml
+ 1.16.0-SNAPSHOT
+
+
+ 4.0.0
+
+ parquet-variant
+ jar
+
+ Apache Parquet Variant
+ https://parquet.apache.org
+
+
+
+
+
+
+ org.apache.parquet
+ parquet-jackson
+ ${project.version}
+ runtime
+
+
+ ${jackson.groupId}
+ jackson-core
+ ${jackson.version}
+
+
+ ${jackson.groupId}
+ jackson-databind
+ ${jackson-databind.version}
+ test
+
+
+ com.google.guava
+ guava
+ ${guava.version}
+ test
+
+
+ org.slf4j
+ slf4j-log4j12
+ ${slf4j.version}
+ test
+
+
+ org.slf4j
+ slf4j-api
+ ${slf4j.version}
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+
+
+
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java
new file mode 100644
index 0000000000..e9bff469d2
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+/**
+ * Unchecked exception signalling that Variant binary data is malformed, for example when the
+ * metadata does not carry the supported version.
+ */
+public class MalformedVariantException extends RuntimeException {
+  /**
+   * Creates the exception with no detail message and no cause.
+   */
+  public MalformedVariantException() {
+    super();
+  }
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java
new file mode 100644
index 0000000000..2f0bd5dce6
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/UnknownVariantTypeException.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+/**
+ * Unchecked exception raised when a Variant carries a type id that is not recognized. The
+ * offending id is kept both as a public field and behind a getter.
+ */
+public class UnknownVariantTypeException extends RuntimeException {
+  /** The type id that could not be recognized. */
+  public final int typeId;
+
+  /**
+   * Builds the exception; its detail message embeds the given id.
+   * @param typeId the type id that was unknown
+   */
+  public UnknownVariantTypeException(int typeId) {
+    super("Unknown type in Variant. id: ".concat(String.valueOf(typeId)));
+    this.typeId = typeId;
+  }
+
+  /**
+   * @return the type id that was unknown
+   */
+  public int getTypeId() {
+    return this.typeId;
+  }
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java
new file mode 100644
index 0000000000..acb635119f
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java
@@ -0,0 +1,435 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.Locale;
+
+import static java.time.temporal.ChronoField.*;
+import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
+import static org.apache.parquet.variant.VariantUtil.*;
+
+/**
+ * Holds a Variant: the encoded `value` bytes, the `metadata` bytes that contain the key
+ * dictionary, and the position in `value` at which this Variant starts. Sub-variants (object
+ * fields and array elements) are created over the same two arrays and differ only in `pos`, so
+ * no bytes are copied when navigating into a nested value.
+ */
+public final class Variant {
+  /**
+   * Shared factory for the generators created by `escapeJson`. A `JsonFactory` is thread-safe
+   * and meant to be reused, so a single instance serves every call instead of allocating a new
+   * factory for each escaped string.
+   */
+  private static final JsonFactory JSON_FACTORY = new JsonFactory();
+
+  /**
+   * Objects with fewer fields than this are scanned linearly by `getFieldByKey`; objects with
+   * at least this many fields are binary-searched.
+   */
+  private static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  final byte[] value;
+  final byte[] metadata;
+  /**
+   * The starting index into `value` where the variant value starts. This is used to avoid copying
+   * the value binary when reading a sub-variant in the array/object element.
+   */
+  final int pos;
+
+  /**
+   * Creates a Variant that starts at index 0 of `value`.
+   * @param value the Variant-encoded value bytes (kept by reference, not copied)
+   * @param metadata the Variant metadata bytes (kept by reference, not copied)
+   * @throws MalformedVariantException if `metadata` is empty or its version is not supported
+   */
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, metadata, 0);
+  }
+
+  /**
+   * Creates a Variant that starts at index `pos` of `value`.
+   * @param value the Variant-encoded value bytes (kept by reference, not copied)
+   * @param metadata the Variant metadata bytes (kept by reference, not copied)
+   * @param pos the index in `value` at which this Variant starts
+   * @throws MalformedVariantException if `metadata` is empty or its version is not supported
+   */
+  Variant(byte[] value, byte[] metadata, int pos) {
+    this.value = value;
+    this.metadata = metadata;
+    this.pos = pos;
+    // There is currently only one allowed version.
+    if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) {
+      throw malformedVariant();
+    }
+  }
+
+  /**
+   * Returns the bytes of this Variant's value. When this Variant starts at index 0 the backing
+   * array itself is returned (no copy); otherwise a copy of just this value's byte range is
+   * returned.
+   * @return the value bytes
+   */
+  public byte[] getValue() {
+    if (pos == 0) return value;
+    int size = valueSize(value, pos);
+    checkIndex(pos + size - 1, value.length);
+    return Arrays.copyOfRange(value, pos, pos + size);
+  }
+
+  /**
+   * @return the metadata bytes (the backing array itself, not a copy)
+   */
+  public byte[] getMetadata() {
+    return metadata;
+  }
+
+  /**
+   * @return the boolean value
+   */
+  public boolean getBoolean() {
+    return VariantUtil.getBoolean(value, pos);
+  }
+
+  /**
+   * @return the long value
+   */
+  public long getLong() {
+    return VariantUtil.getLong(value, pos);
+  }
+
+  /**
+   * @return the double value
+   */
+  public double getDouble() {
+    return VariantUtil.getDouble(value, pos);
+  }
+
+  /**
+   * @return the decimal value
+   */
+  public BigDecimal getDecimal() {
+    return VariantUtil.getDecimal(value, pos);
+  }
+
+  /**
+   * @return the float value
+   */
+  public float getFloat() {
+    return VariantUtil.getFloat(value, pos);
+  }
+
+  /**
+   * @return the binary value
+   */
+  public byte[] getBinary() {
+    return VariantUtil.getBinary(value, pos);
+  }
+
+  /**
+   * @return the string value
+   */
+  public String getString() {
+    return VariantUtil.getString(value, pos);
+  }
+
+  /**
+   * @return the type info bits from a variant value
+   */
+  public int getTypeInfo() {
+    return VariantUtil.getTypeInfo(value, pos);
+  }
+
+  /**
+   * @return the type of the variant value
+   */
+  public Type getType() {
+    return VariantUtil.getType(value, pos);
+  }
+
+  /**
+   * @return the number of object fields in the variant. `getType()` must be `Type.OBJECT`.
+   */
+  public int objectSize() {
+    return handleObject(value, pos,
+        (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size);
+  }
+
+  /**
+   * Returns the object field Variant value whose key is equal to `key`.
+   * Return null if the key is not found. `getType()` must be `Type.OBJECT`.
+   * Small objects are scanned linearly; larger ones are binary-searched, which relies on the
+   * object's field ids being stored in key order.
+   * @param key the key to look up
+   * @return the field value whose key is equal to `key`, or null if key is not found
+   */
+  public Variant getFieldByKey(String key) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      // Use linear search for a short list. Switch to binary search when the length reaches
+      // `BINARY_SEARCH_THRESHOLD`.
+      if (size < BINARY_SEARCH_THRESHOLD) {
+        for (int i = 0; i < size; ++i) {
+          int id = readUnsigned(value, idStart + idSize * i, idSize);
+          if (key.equals(getMetadataKey(metadata, id))) {
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            return new Variant(value, metadata, dataStart + offset);
+          }
+        }
+      } else {
+        int low = 0;
+        int high = size - 1;
+        while (low <= high) {
+          // The unsigned right shift yields the correct midpoint even when `low + high`
+          // overflows int, so it is a correctness measure and not only an optimization.
+          int mid = (low + high) >>> 1;
+          int id = readUnsigned(value, idStart + idSize * mid, idSize);
+          int cmp = getMetadataKey(metadata, id).compareTo(key);
+          if (cmp < 0) {
+            low = mid + 1;
+          } else if (cmp > 0) {
+            high = mid - 1;
+          } else {
+            int offset = readUnsigned(value, offsetStart + offsetSize * mid, offsetSize);
+            return new Variant(value, metadata, dataStart + offset);
+          }
+        }
+      }
+      return null;
+    });
+  }
+
+  /**
+   * A field in a Variant object: its key and its value.
+   */
+  public static final class ObjectField {
+    public final String key;
+    public final Variant value;
+
+    public ObjectField(String key, Variant value) {
+      this.key = key;
+      this.value = value;
+    }
+  }
+
+  /**
+   * Returns the object field at the `index` slot. Return null if `index` is out of the bound of
+   * `[0, objectSize())`. `getType()` must be `Type.OBJECT`.
+   * @param index the index of the object field to get
+   * @return the ObjectField at the `index` slot, or null if `index` is out of bounds
+   */
+  public ObjectField getFieldAtIndex(int index) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      if (index < 0 || index >= size) return null;
+      int id = readUnsigned(value, idStart + idSize * index, idSize);
+      int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
+      String key = getMetadataKey(metadata, id);
+      Variant v = new Variant(value, metadata, dataStart + offset);
+      return new ObjectField(key, v);
+    });
+  }
+
+  /**
+   * Returns the dictionary ID for the object field at the `index` slot.
+   * `getType()` must be `Type.OBJECT`.
+   * @param index the index of the object field to get the dictionary ID for
+   * @return the dictionary ID for the object field at the `index` slot
+   * @throws MalformedVariantException if `index` is out of bounds
+   */
+  public int getDictionaryIdAtIndex(int index) {
+    return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+      if (index < 0 || index >= size) {
+        throw malformedVariant();
+      }
+      return readUnsigned(value, idStart + idSize * index, idSize);
+    });
+  }
+
+  /**
+   * @return the number of array elements. `getType()` must be `Type.ARRAY`.
+   */
+  public int arraySize() {
+    return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size);
+  }
+
+  /**
+   * Returns the array element Variant value at the `index` slot. Returns null if `index` is
+   * out of the bound of `[0, arraySize())`. `getType()` must be `Type.ARRAY`.
+   * @param index the index of the array element to get
+   * @return the array element Variant at the `index` slot, or null if `index` is out of bounds
+   */
+  public Variant getElementAtIndex(int index) {
+    return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> {
+      if (index < 0 || index >= size) return null;
+      int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
+      return new Variant(value, metadata, dataStart + offset);
+    });
+  }
+
+  /**
+   * Renders this Variant as JSON without truncating trailing zeros.
+   * @param zoneId The ZoneId to use for formatting timestamps
+   * @return the JSON representation of the variant
+   * @throws MalformedVariantException if the variant is malformed
+   */
+  public String toJson(ZoneId zoneId) {
+    return toJson(zoneId, false);
+  }
+
+  /**
+   * Renders this Variant as JSON.
+   * @param zoneId The ZoneId to use for formatting timestamps
+   * @param truncateTrailingZeros Whether to truncate trailing zeros in decimal values or timestamps
+   * @return the JSON representation of the variant
+   * @throws MalformedVariantException if the variant is malformed
+   */
+  public String toJson(ZoneId zoneId, boolean truncateTrailingZeros) {
+    StringBuilder sb = new StringBuilder();
+    toJsonImpl(value, metadata, pos, sb, zoneId, truncateTrailingZeros);
+    return sb.toString();
+  }
+
+  /**
+   * Escapes a string so that it can be pasted into a JSON structure. For example, if `str`
+   * only contains a new-line character, then the result is "\n" (4 characters)
+   * @param str the string to escape
+   * @return the escaped string, including the surrounding double quotes
+   */
+  private static String escapeJson(String str) {
+    try (CharArrayWriter writer = new CharArrayWriter();
+        JsonGenerator gen = JSON_FACTORY.createGenerator(writer)) {
+      gen.writeString(str);
+      gen.flush();
+      return writer.toString();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Appends a quoted string to a StringBuilder. It is used when we know `str` doesn't contain any
+   * special characters that needs escaping. This is more performant than
+   * `sb.append(escapeJson(str))`.
+   * @param sb the StringBuilder to append to
+   * @param str the string to append
+   */
+  private static void appendQuoted(StringBuilder sb, String str) {
+    sb.append('"');
+    sb.append(str);
+    sb.append('"');
+  }
+
+  /** The format for a timestamp without time zone. */
+  private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder()
+      .append(DateTimeFormatter.ISO_LOCAL_DATE)
+      .appendLiteral('T')
+      .appendPattern("HH:mm:ss")
+      .appendFraction(MICRO_OF_SECOND, 6, 6, true)
+      .toFormatter(Locale.US);
+
+  /** The format for a timestamp with time zone. */
+  private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder()
+      .append(TIMESTAMP_NTZ_FORMATTER)
+      .appendOffset("+HH:MM", "+00:00")
+      .toFormatter(Locale.US);
+
+  /** The format for a timestamp without time zone, truncating trailing microsecond zeros. */
+  private static final DateTimeFormatter TIMESTAMP_NTZ_TRUNC_FORMATTER =
+      new DateTimeFormatterBuilder()
+          .append(DateTimeFormatter.ISO_LOCAL_DATE)
+          .appendLiteral('T')
+          .appendPattern("HH:mm:ss")
+          .optionalStart()
+          .appendFraction(MICRO_OF_SECOND, 0, 6, true)
+          .optionalEnd()
+          .toFormatter(Locale.US);
+
+  /** The format for a timestamp with time zone, truncating trailing microsecond zeros. */
+  private static final DateTimeFormatter TIMESTAMP_TRUNC_FORMATTER = new DateTimeFormatterBuilder()
+      .append(TIMESTAMP_NTZ_TRUNC_FORMATTER)
+      .appendOffset("+HH:MM", "+00:00")
+      .toFormatter(Locale.US);
+
+  /**
+   * Converts a count of microseconds since the epoch to an `Instant`.
+   * @param microsSinceEpoch microseconds since 1970-01-01T00:00:00Z
+   * @return the corresponding instant
+   */
+  private static Instant microsToInstant(long microsSinceEpoch) {
+    return Instant.EPOCH.plus(microsSinceEpoch, ChronoUnit.MICROS);
+  }
+
+  /**
+   * Recursively appends the JSON form of the Variant starting at `pos` to `sb`. Objects and
+   * arrays recurse into their children; strings and object keys are JSON-escaped; dates,
+   * timestamps and Base64-encoded binary are emitted as quoted strings. Timestamps with time
+   * zone are formatted in `zoneId`, timestamps without time zone are formatted in UTC.
+   * @param value the Variant value bytes
+   * @param metadata the Variant metadata bytes
+   * @param pos the index in `value` at which the Variant to render starts
+   * @param sb the builder receiving the JSON text
+   * @param zoneId the zone used for timestamps with time zone
+   * @param truncateTrailingZeros whether trailing zeros of decimals and timestamp fractions are
+   *                              dropped
+   */
+  private static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb,
+      ZoneId zoneId, boolean truncateTrailingZeros) {
+    switch (VariantUtil.getType(value, pos)) {
+      case OBJECT:
+        handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+          sb.append('{');
+          for (int i = 0; i < size; ++i) {
+            int id = readUnsigned(value, idStart + idSize * i, idSize);
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            int elementPos = dataStart + offset;
+            if (i != 0) sb.append(',');
+            sb.append(escapeJson(getMetadataKey(metadata, id)));
+            sb.append(':');
+            toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros);
+          }
+          sb.append('}');
+          return null;
+        });
+        break;
+      case ARRAY:
+        handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> {
+          sb.append('[');
+          for (int i = 0; i < size; ++i) {
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            int elementPos = dataStart + offset;
+            if (i != 0) sb.append(',');
+            toJsonImpl(value, metadata, elementPos, sb, zoneId, truncateTrailingZeros);
+          }
+          sb.append(']');
+          return null;
+        });
+        break;
+      case NULL:
+        sb.append("null");
+        break;
+      case BOOLEAN:
+        sb.append(VariantUtil.getBoolean(value, pos));
+        break;
+      case LONG:
+        sb.append(VariantUtil.getLong(value, pos));
+        break;
+      case STRING:
+        sb.append(escapeJson(VariantUtil.getString(value, pos)));
+        break;
+      case DOUBLE:
+        sb.append(VariantUtil.getDouble(value, pos));
+        break;
+      case DECIMAL:
+        if (truncateTrailingZeros) {
+          sb.append(VariantUtil.getDecimal(value, pos).stripTrailingZeros().toPlainString());
+        } else {
+          sb.append(VariantUtil.getDecimal(value, pos).toPlainString());
+        }
+        break;
+      case DATE:
+        appendQuoted(sb, LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)).toString());
+        break;
+      case TIMESTAMP:
+        if (truncateTrailingZeros) {
+          appendQuoted(sb, TIMESTAMP_TRUNC_FORMATTER.format(
+              microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId)));
+        } else {
+          appendQuoted(sb, TIMESTAMP_FORMATTER.format(
+              microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId)));
+        }
+        break;
+      case TIMESTAMP_NTZ:
+        if (truncateTrailingZeros) {
+          appendQuoted(sb, TIMESTAMP_NTZ_TRUNC_FORMATTER.format(
+              microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC)));
+        } else {
+          appendQuoted(sb, TIMESTAMP_NTZ_FORMATTER.format(
+              microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC)));
+        }
+        break;
+      case FLOAT:
+        sb.append(VariantUtil.getFloat(value, pos));
+        break;
+      case BINARY:
+        appendQuoted(sb, Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos)));
+        break;
+    }
+  }
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java
new file mode 100644
index 0000000000..574c8fdbde
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java
@@ -0,0 +1,631 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.exc.InputCoercionException;
+
+import static org.apache.parquet.variant.VariantUtil.*;
+
+/**
+ * Builder for creating Variant value and metadata.
+ */
+public class VariantBuilder {
+ /**
+  * Creates a builder that uses `DEFAULT_SIZE_LIMIT` as its size limit.
+  * @param allowDuplicateKeys whether an object may contain the same key more than once; when
+  *                           false, `finishWritingObject` rejects duplicates
+  */
+ public VariantBuilder(boolean allowDuplicateKeys) {
+   this(allowDuplicateKeys, DEFAULT_SIZE_LIMIT);
+ }
+
+ /**
+  * Creates a builder with an explicit size limit.
+  * @param allowDuplicateKeys whether an object may contain the same key more than once; when
+  *                           false, `finishWritingObject` rejects duplicates
+  * @param sizeLimitBytes the size limit, in bytes; `result()` checks the metadata against it
+  */
+ public VariantBuilder(boolean allowDuplicateKeys, int sizeLimitBytes) {
+   this.sizeLimitBytes = sizeLimitBytes;
+   this.allowDuplicateKeys = allowDuplicateKeys;
+ }
+
+ /**
+  * Parses a JSON string into a Variant using a fresh builder that does not allow duplicate
+  * object keys.
+  * @param json the JSON string to parse
+  * @return the Variant value
+  * @throws IOException if any JSON parsing error happens
+  * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed
+  * the size limit
+  */
+ public static Variant parseJson(String json) throws IOException {
+   VariantBuilder strictBuilder = new VariantBuilder(false);
+   return parseJson(json, strictBuilder);
+ }
+
+ /**
+  * Parses a JSON string into a Variant with the supplied builder. A parser is opened over the
+  * string, advanced to its first token, and closed again once parsing has finished.
+  * @param json the JSON string to parse
+  * @param builder the VariantBuilder to use for building the Variant
+  * @return the Variant value
+  * @throws IOException if any JSON parsing error happens
+  * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed
+  * the size limit
+  */
+ public static Variant parseJson(String json, VariantBuilder builder) throws IOException {
+   JsonFactory factory = new JsonFactory();
+   try (JsonParser jsonParser = factory.createParser(json)) {
+     // Move onto the first token before handing the parser over.
+     jsonParser.nextToken();
+     return parseJson(jsonParser, builder);
+   }
+ }
+
+ /**
+  * Feeds the content of a JSON parser into the builder and returns the resulting Variant.
+  * NOTE(review): the String overload advances the parser to its first token before calling
+  * this method, so callers presumably need to do the same; confirm against
+  * `buildFromJsonParser`.
+  * @param parser the JSON parser to use
+  * @param builder the VariantBuilder to use for building the Variant
+  * @return the Variant value
+  * @throws IOException if any JSON parsing error happens
+  * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed
+  * the size limit
+  */
+ public static Variant parseJson(JsonParser parser, VariantBuilder builder)
+     throws IOException {
+   builder.buildFromJsonParser(parser);
+   Variant parsed = builder.result();
+   return parsed;
+ }
+
+ /**
+  * Assembles the metadata (key dictionary) and returns it together with a copy of the value
+  * bytes written so far. The metadata written here consists of one header byte (`VERSION`
+  * combined with `offsetSize - 1` shifted into bits 6-7), the key count, `numKeys + 1` offsets
+  * into the key bytes, and finally the concatenated key bytes.
+  * @return the Variant value
+  * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed
+  * the size limit
+  */
+ public Variant result() {
+   int numKeys = dictionaryKeys.size();
+   // Use long to avoid overflow in accumulating lengths.
+   long dictionaryStringSize = 0;
+   for (byte[] key : dictionaryKeys) {
+     dictionaryStringSize += key.length;
+   }
+   // Determine the number of bytes required per offset entry.
+   // The largest offset is the one-past-the-end value, which is total string size. It's very
+   // unlikely that the number of keys could be larger, but incorporate that into the calculation
+   // in case of pathological data.
+   long maxSize = Math.max(dictionaryStringSize, numKeys);
+   if (maxSize > sizeLimitBytes) {
+     throw new VariantSizeLimitException();
+   }
+   int offsetSize = getMinIntegerSize((int) maxSize);
+
+   int offsetStart = 1 + offsetSize;
+   // Compute the layout in long: with a very large size limit, `(numKeys + 1) * offsetSize`
+   // could overflow int before the limit check below gets a chance to reject it.
+   long stringStartLong = offsetStart + (numKeys + 1L) * offsetSize;
+   long metadataSize = stringStartLong + dictionaryStringSize;
+
+   if (metadataSize > sizeLimitBytes) {
+     throw new VariantSizeLimitException();
+   }
+   // Both values are now known to be within the int-valued size limit.
+   int stringStart = (int) stringStartLong;
+   byte[] metadata = new byte[(int) metadataSize];
+   int headerByte = VERSION | ((offsetSize - 1) << 6);
+   writeLong(metadata, 0, headerByte, 1);
+   writeLong(metadata, 1, numKeys, offsetSize);
+   int currentOffset = 0;
+   for (int i = 0; i < numKeys; ++i) {
+     writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize);
+     byte[] key = dictionaryKeys.get(i);
+     System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length);
+     currentOffset += key.length;
+   }
+   // The final offset is the one-past-the-end position of the key bytes.
+   writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize);
+   return new Variant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata);
+ }
+
+ /**
+  * Appends a string value, encoded as UTF-8. A string whose encoded length is at most
+  * `MAX_SHORT_STR_SIZE` bytes gets a single short-string header byte; a longer one gets the
+  * `LONG_STR` primitive header followed by its length in `U32_SIZE` bytes.
+  * @param str the string to append
+  */
+ public void appendString(String str) {
+   byte[] utf8 = str.getBytes(StandardCharsets.UTF_8);
+   int length = utf8.length;
+   boolean fitsShortHeader = length <= MAX_SHORT_STR_SIZE;
+   int headerBytes = fitsShortHeader ? 1 : 1 + U32_SIZE;
+   checkCapacity(headerBytes + length);
+   if (fitsShortHeader) {
+     writeBuffer[writePos++] = shortStrHeader(length);
+   } else {
+     writeBuffer[writePos++] = primitiveHeader(LONG_STR);
+     writeLong(writeBuffer, writePos, length, U32_SIZE);
+     writePos += U32_SIZE;
+   }
+   System.arraycopy(utf8, 0, writeBuffer, writePos, length);
+   writePos += length;
+ }
+
+ /**
+  * Appends a null value, which is a single `NULL` primitive header byte.
+  */
+ public void appendNull() {
+   checkCapacity(1);
+   final byte header = primitiveHeader(NULL);
+   writeBuffer[writePos++] = header;
+ }
+
+ /**
+  * Appends a boolean value, which is a single primitive header byte of type `TRUE` or `FALSE`.
+  * @param b the boolean value to append
+  */
+ public void appendBoolean(boolean b) {
+   checkCapacity(1);
+   if (b) {
+     writeBuffer[writePos++] = primitiveHeader(TRUE);
+   } else {
+     writeBuffer[writePos++] = primitiveHeader(FALSE);
+   }
+ }
+
+ /**
+  * Appends an integer value using the narrowest of the `INT1`, `INT2`, `INT4` and `INT8`
+  * encodings that can represent it without loss.
+  * @param l the long value to append
+  */
+ public void appendLong(long l) {
+   // Reserve room for the widest case: one header byte plus eight payload bytes.
+   checkCapacity(1 + 8);
+   final byte header;
+   final int width;
+   if (l == (byte) l) {
+     header = primitiveHeader(INT1);
+     width = 1;
+   } else if (l == (short) l) {
+     header = primitiveHeader(INT2);
+     width = 2;
+   } else if (l == (int) l) {
+     header = primitiveHeader(INT4);
+     width = 4;
+   } else {
+     header = primitiveHeader(INT8);
+     width = 8;
+   }
+   writeBuffer[writePos++] = header;
+   writeLong(writeBuffer, writePos, l, width);
+   writePos += width;
+ }
+
+ /**
+  * Appends a double value: a `DOUBLE` primitive header byte followed by the eight bytes of
+  * `Double.doubleToLongBits(d)`.
+  * @param d the double value to append
+  */
+ public void appendDouble(double d) {
+   checkCapacity(1 + 8);
+   final long bits = Double.doubleToLongBits(d);
+   writeBuffer[writePos++] = primitiveHeader(DOUBLE);
+   writeLong(writeBuffer, writePos, bits, 8);
+   writePos += 8;
+ }
+
+ /**
+  * Appends a decimal value as a header byte, one scale byte and the unscaled value. The
+  * narrowest of `DECIMAL4`, `DECIMAL8` and `DECIMAL16` whose maximum precision covers both the
+  * scale and the precision of `d` is chosen; the unscaled value then occupies 4, 8 or 16
+  * bytes respectively.
+  * @param d the decimal value to append
+  */
+ public void appendDecimal(BigDecimal d) {
+   // Reserve room for the widest case: header byte, scale byte and sixteen payload bytes.
+   checkCapacity(2 + 16);
+   final int scale = d.scale();
+   final int precision = d.precision();
+   final BigInteger unscaled = d.unscaledValue();
+   if (scale <= MAX_DECIMAL4_PRECISION && precision <= MAX_DECIMAL4_PRECISION) {
+     writeBuffer[writePos++] = primitiveHeader(DECIMAL4);
+     writeBuffer[writePos++] = (byte) scale;
+     writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4);
+     writePos += 4;
+   } else if (scale <= MAX_DECIMAL8_PRECISION && precision <= MAX_DECIMAL8_PRECISION) {
+     writeBuffer[writePos++] = primitiveHeader(DECIMAL8);
+     writeBuffer[writePos++] = (byte) scale;
+     writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8);
+     writePos += 8;
+   } else {
+     assert scale <= MAX_DECIMAL16_PRECISION && precision <= MAX_DECIMAL16_PRECISION;
+     writeBuffer[writePos++] = primitiveHeader(DECIMAL16);
+     writeBuffer[writePos++] = (byte) scale;
+     // `toByteArray` is big-endian: store it in reverse order, then pad up to sixteen bytes
+     // with the sign byte.
+     final byte[] bigEndian = unscaled.toByteArray();
+     final int length = bigEndian.length;
+     for (int i = 0; i < length; ++i) {
+       writeBuffer[writePos + i] = bigEndian[length - 1 - i];
+     }
+     final byte signFill = (byte) (bigEndian[0] < 0 ? -1 : 0);
+     for (int i = length; i < 16; ++i) {
+       writeBuffer[writePos + i] = signFill;
+     }
+     writePos += 16;
+   }
+ }
+
+ /**
+  * Appends a date value: a `DATE` primitive header byte followed by the day count in four
+  * bytes.
+  * @param daysSinceEpoch the date, as a number of days since the epoch
+  */
+ public void appendDate(int daysSinceEpoch) {
+   final int payloadBytes = 4;
+   checkCapacity(1 + payloadBytes);
+   writeBuffer[writePos++] = primitiveHeader(DATE);
+   writeLong(writeBuffer, writePos, daysSinceEpoch, payloadBytes);
+   writePos += payloadBytes;
+ }
+
+ /**
+  * Appends a timestamp value: a `TIMESTAMP` primitive header byte followed by the microsecond
+  * count in eight bytes.
+  * @param microsSinceEpoch the timestamp, as a number of microseconds since the epoch
+  */
+ public void appendTimestamp(long microsSinceEpoch) {
+   final int payloadBytes = 8;
+   checkCapacity(1 + payloadBytes);
+   writeBuffer[writePos++] = primitiveHeader(TIMESTAMP);
+   writeLong(writeBuffer, writePos, microsSinceEpoch, payloadBytes);
+   writePos += payloadBytes;
+ }
+
+ /**
+  * Appends a timestamp-without-time-zone value: a `TIMESTAMP_NTZ` primitive header byte
+  * followed by the microsecond count in eight bytes.
+  * @param microsSinceEpoch the timestamp, as a number of microseconds since the epoch
+  */
+ public void appendTimestampNtz(long microsSinceEpoch) {
+   final int payloadBytes = 8;
+   checkCapacity(1 + payloadBytes);
+   writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_NTZ);
+   writeLong(writeBuffer, writePos, microsSinceEpoch, payloadBytes);
+   writePos += payloadBytes;
+ }
+
+ /**
+  * Appends a float value: a `FLOAT` primitive header byte followed by the four bytes of
+  * `Float.floatToIntBits(f)`.
+  * @param f the float value to append
+  */
+ public void appendFloat(float f) {
+   checkCapacity(1 + 4);
+   writeBuffer[writePos++] = primitiveHeader(FLOAT);
+   // Write exactly the four payload bytes that were reserved above and that `writePos` is
+   // advanced by; asking for eight would write past the reserved capacity.
+   writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 4);
+   writePos += 4;
+ }
+
+ /**
+  * Appends a binary value: a `BINARY` primitive header byte, the length in `U32_SIZE` bytes,
+  * then the raw bytes.
+  * @param binary the bytes to append
+  */
+ public void appendBinary(byte[] binary) {
+   final int length = binary.length;
+   checkCapacity(1 + U32_SIZE + length);
+   writeBuffer[writePos++] = primitiveHeader(BINARY);
+   writeLong(writeBuffer, writePos, length, U32_SIZE);
+   writePos += U32_SIZE;
+   System.arraycopy(binary, 0, writeBuffer, writePos, length);
+   writePos += length;
+ }
+
+ /**
+  * Adds a key to the Variant dictionary. If the key already exists, the dictionary is unmodified.
+  * A new key receives the next free id (the current number of keys) and is stored as UTF-8.
+  * @param key the key to add
+  * @return the id of the key
+  */
+ public int addKey(String key) {
+   // A single lookup is enough: ids are never stored as null, so null means "not present".
+   Integer existingId = dictionary.get(key);
+   if (existingId != null) {
+     return existingId;
+   }
+   int newId = dictionaryKeys.size();
+   dictionary.put(key, newId);
+   dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8));
+   return newId;
+ }
+
+ /**
+  * Returns the index in the write buffer at which the next byte will be written. Callers
+  * record it before writing an object's fields and pass it as `start` to
+  * `finishWritingObject`.
+  * @return the current write position of the variant builder
+  */
+ public int getWritePos() {
+   return this.writePos;
+ }
+
+ /**
+  * Finish writing a Variant object after all of its fields have already been written. The process
+  * is as follows:
+  * 1. The caller calls `getWritePos()` before writing any fields to obtain the `start` parameter.
+  * 2. The caller appends all the object fields to the builder. In the meantime, it should maintain
+  * the `fields` parameter. Before appending each field, it should append an entry to `fields` to
+  * record the offset of the field. The offset is computed as `getWritePos() - start`.
+  * 3. The caller calls `finishWritingObject` to finish writing the Variant object.
+  *
+  * This method will sort the fields by key. If there are duplicate field keys:
+  * - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last
+  * appended one) is kept.
+  * - otherwise, throw an exception.
+  *
+  * The already-written field data is then shifted right to make room for the object header
+  * (header byte, field count, id list and offset list), which is written at `start`.
+  * Note that `fields` is modified in place: it is sorted and, when duplicates are dropped,
+  * shortened and given new offsets.
+  * @param start the start position of the object in the write buffer
+  * @param fields the list of `FieldEntry` in the object
+  * @throws VariantDuplicateKeyException if there are duplicate keys and `allowDuplicateKeys` is
+  * false
+  */
+ public void finishWritingObject(int start, ArrayList<FieldEntry> fields) {
+   int size = fields.size();
+   Collections.sort(fields);
+   int maxId = size == 0 ? 0 : fields.get(0).id;
+   if (allowDuplicateKeys) {
+     int distinctPos = 0;
+     // Maintain a list of distinct keys in-place.
+     for (int i = 1; i < size; ++i) {
+       maxId = Math.max(maxId, fields.get(i).id);
+       if (fields.get(i).id == fields.get(i - 1).id) {
+         // Found a duplicate key. Keep the field with the greater offset.
+         if (fields.get(distinctPos).offset < fields.get(i).offset) {
+           fields.set(distinctPos, fields.get(distinctPos).withNewOffset(fields.get(i).offset));
+         }
+       } else {
+         // Found a distinct key. Add the field to the list.
+         ++distinctPos;
+         fields.set(distinctPos, fields.get(i));
+       }
+     }
+     if (distinctPos + 1 < fields.size()) {
+       size = distinctPos + 1;
+       // Resize `fields` to `size`.
+       fields.subList(size, fields.size()).clear();
+       // Sort the fields by offsets so that we can move the value data of each field to the new
+       // offset without overwriting the fields after it.
+       fields.sort(Comparator.comparingInt(f -> f.offset));
+       int currentOffset = 0;
+       for (int i = 0; i < size; ++i) {
+         int oldOffset = fields.get(i).offset;
+         int fieldSize = VariantUtil.valueSize(writeBuffer, start + oldOffset);
+         System.arraycopy(writeBuffer, start + oldOffset,
+             writeBuffer, start + currentOffset, fieldSize);
+         fields.set(i, fields.get(i).withNewOffset(currentOffset));
+         currentOffset += fieldSize;
+       }
+       writePos = start + currentOffset;
+       // Change back to the sort order by field keys, required by the Variant specification.
+       Collections.sort(fields);
+     }
+   } else {
+     for (int i = 1; i < size; ++i) {
+       maxId = Math.max(maxId, fields.get(i).id);
+       String key = fields.get(i).key;
+       if (key.equals(fields.get(i - 1).key)) {
+         throw new VariantDuplicateKeyException(key);
+       }
+     }
+   }
+   int dataSize = writePos - start;
+   boolean largeSize = size > U8_MAX;
+   int sizeBytes = largeSize ? U32_SIZE : 1;
+   int idSize = getMinIntegerSize(maxId);
+   int offsetSize = getMinIntegerSize(dataSize);
+   // The space for header byte, object size, id list, and offset list.
+   int headerSize = 1 + sizeBytes + size * idSize + (size + 1) * offsetSize;
+   checkCapacity(headerSize);
+   // Shift the just-written field data to make room for the object header section.
+   System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize);
+   writePos += headerSize;
+   writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize);
+   writeLong(writeBuffer, start + 1, size, sizeBytes);
+   int idStart = start + 1 + sizeBytes;
+   int offsetStart = idStart + size * idSize;
+   for (int i = 0; i < size; ++i) {
+     writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize);
+     writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize);
+   }
+   // The trailing offset entry is the total size of the field data.
+   writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize);
+ }
+
+ /**
+  * Finish writing a Variant array after all of its elements have already been written. The process
+  * is similar to that of `finishWritingObject`: the element data written since `start` is shifted
+  * to make room for the header byte, the array size, and the offset list.
+  * @param start the start position of the array in the write buffer
+  * @param offsets the list of offsets of the array elements
+  */
+ public void finishWritingArray(int start, ArrayList<Integer> offsets) {
+   int dataSize = writePos - start;
+   int size = offsets.size();
+   boolean largeSize = size > U8_MAX;
+   int sizeBytes = largeSize ? U32_SIZE : 1;
+   int offsetSize = getMinIntegerSize(dataSize);
+   // The space for header byte, array size, and offset list.
+   int headerSize = 1 + sizeBytes + (size + 1) * offsetSize;
+   checkCapacity(headerSize);
+   // Shift the just-written element data to make room for the header section.
+   System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize);
+   writePos += headerSize;
+   writeBuffer[start] = arrayHeader(largeSize, offsetSize);
+   writeLong(writeBuffer, start + 1, size, sizeBytes);
+   int offsetStart = start + 1 + sizeBytes;
+   for (int i = 0; i < size; ++i) {
+     writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize);
+   }
+   // The offset list has one extra trailing entry: the total size of the element data.
+   writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize);
+ }
+
+ /**
+ * Appends a Variant value to the Variant builder. The input Variant keys must be inserted into
+ * the builder dictionary and rebuilt with new field ids. For scalar values in the input
+ * Variant, we can directly copy the binary slice.
+ * @param v the Variant value to append
+ */
+ public void appendVariant(Variant v) {
+ appendVariantImpl(v.value, v.metadata, v.pos);
+ }
+
+ /**
+  * Recursively appends the Variant value at `value[pos...]` to the write buffer. Objects and
+  * arrays are rebuilt element by element (object keys are looked up in `metadata` and added to
+  * this builder's dictionary); every other value is copied as-is.
+  * @param value the Variant value binary to read from
+  * @param metadata the Variant metadata binary that `value` refers to
+  * @param pos the starting index of the value inside `value`
+  */
+ private void appendVariantImpl(byte[] value, byte[] metadata, int pos) {
+   checkIndex(pos, value.length);
+   int basicType = value[pos] & BASIC_TYPE_MASK;
+   switch (basicType) {
+     case OBJECT:
+       handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+         ArrayList<FieldEntry> fields = new ArrayList<>(size);
+         int start = writePos;
+         for (int i = 0; i < size; ++i) {
+           int id = readUnsigned(value, idStart + idSize * i, idSize);
+           int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+           int elementPos = dataStart + offset;
+           // Translate the source field id into an id of this builder's dictionary.
+           String key = getMetadataKey(metadata, id);
+           int newId = addKey(key);
+           fields.add(new FieldEntry(key, newId, writePos - start));
+           appendVariantImpl(value, metadata, elementPos);
+         }
+         finishWritingObject(start, fields);
+         return null;
+       });
+       break;
+     case ARRAY:
+       handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> {
+         ArrayList<Integer> offsets = new ArrayList<>(size);
+         int start = writePos;
+         for (int i = 0; i < size; ++i) {
+           int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+           int elementPos = dataStart + offset;
+           offsets.add(writePos - start);
+           appendVariantImpl(value, metadata, elementPos);
+         }
+         finishWritingArray(start, offsets);
+         return null;
+       });
+       break;
+     default:
+       // Primitives and short strings do not reference the dictionary; copy the bytes.
+       shallowAppendVariantImpl(value, pos);
+       break;
+   }
+ }
+
+ /**
+  * Copies the Variant value at `value[pos...]` byte-for-byte to the end of the write buffer.
+  * @param value the Variant value binary to copy from
+  * @param pos the starting index of the value inside `value`
+  */
+ private void shallowAppendVariantImpl(byte[] value, int pos) {
+   int numBytes = valueSize(value, pos);
+   // The last byte of the value must still lie inside the source array.
+   int lastIndex = pos + numBytes - 1;
+   checkIndex(lastIndex, value.length);
+   checkCapacity(numBytes);
+   System.arraycopy(value, pos, writeBuffer, writePos, numBytes);
+   writePos += numBytes;
+ }
+
+ /**
+  * Ensures that `writeBuffer` can hold `additionalBytes` more bytes after `writePos`, growing it
+  * to the next power of two when necessary.
+  * @param additionalBytes the number of bytes about to be written
+  * @throws VariantSizeLimitException if the required capacity overflows an int or the grown
+  *     buffer would exceed `sizeLimitBytes`
+  */
+ private void checkCapacity(int additionalBytes) {
+   int requiredBytes = writePos + additionalBytes;
+   if (requiredBytes < 0) {
+     // The sum overflowed; without this guard the growth below would be silently skipped.
+     throw new VariantSizeLimitException();
+   }
+   if (requiredBytes > writeBuffer.length) {
+     // Allocate a new buffer with a capacity of the next power of 2 of `requiredBytes`.
+     int newCapacity = Integer.highestOneBit(requiredBytes);
+     newCapacity = newCapacity < requiredBytes ? newCapacity * 2 : newCapacity;
+     // `newCapacity * 2` wraps to a negative value when `requiredBytes` is above 2^30.
+     if (newCapacity < 0 || newCapacity > sizeLimitBytes) {
+       throw new VariantSizeLimitException();
+     }
+     byte[] newValue = new byte[newCapacity];
+     System.arraycopy(writeBuffer, 0, newValue, 0, writePos);
+     writeBuffer = newValue;
+   }
+ }
+
+ // Temporarily store the information of a field. We need to collect all fields in a JSON object,
+ // sort them by their keys, and build the variant object in sorted order.
+
+ /**
+  * Class to store the information of a Variant object field. We need to collect all fields of
+  * an object, sort them by their keys, and build the Variant object in sorted order.
+  * Instances are immutable; the natural ordering compares `key` only.
+  */
+ public static final class FieldEntry implements Comparable<FieldEntry> {
+   /** The field name. */
+   final String key;
+   /** The dictionary id of the field name. */
+   final int id;
+   /** The offset of the field value, relative to the start of the object's value data. */
+   final int offset;
+
+   /**
+    * @param key the field name
+    * @param id the dictionary id of the field name
+    * @param offset the offset of the field value
+    */
+   public FieldEntry(String key, int id, int offset) {
+     this.key = key;
+     this.id = id;
+     this.offset = offset;
+   }
+
+   /**
+    * @param newOffset the offset to use in the copy
+    * @return a copy of this entry with the same key and id but a different offset
+    */
+   FieldEntry withNewOffset(int newOffset) {
+     return new FieldEntry(key, id, newOffset);
+   }
+
+   @Override
+   public int compareTo(FieldEntry other) {
+     return key.compareTo(other.key);
+   }
+ }
+
+ /**
+  * Appends the JSON value at the parser's current token to the write buffer, recursing into
+  * objects and arrays.
+  * @param parser the JSON parser, positioned on the first token of the value to append
+  * @throws IOException if the parser fails, or the current token is null or unexpected
+  */
+ private void buildFromJsonParser(JsonParser parser) throws IOException {
+   JsonToken token = parser.currentToken();
+   if (token == null) {
+     throw new JsonParseException(parser, "Unexpected null token");
+   }
+   switch (token) {
+     case START_OBJECT: {
+       ArrayList<FieldEntry> fields = new ArrayList<>();
+       int start = writePos;
+       while (parser.nextToken() != JsonToken.END_OBJECT) {
+         String key = parser.currentName();
+         // Advance from the field name to the first token of the field value.
+         parser.nextToken();
+         int id = addKey(key);
+         fields.add(new FieldEntry(key, id, writePos - start));
+         buildFromJsonParser(parser);
+       }
+       finishWritingObject(start, fields);
+       break;
+     }
+     case START_ARRAY: {
+       ArrayList<Integer> offsets = new ArrayList<>();
+       int start = writePos;
+       while (parser.nextToken() != JsonToken.END_ARRAY) {
+         offsets.add(writePos - start);
+         buildFromJsonParser(parser);
+       }
+       finishWritingArray(start, offsets);
+       break;
+     }
+     case VALUE_STRING:
+       appendString(parser.getText());
+       break;
+     case VALUE_NUMBER_INT:
+       try {
+         appendLong(parser.getLongValue());
+       } catch (InputCoercionException ignored) {
+         // If the value doesn't fit any integer type, parse it as decimal or floating instead.
+         parseAndAppendFloatingPoint(parser);
+       }
+       break;
+     case VALUE_NUMBER_FLOAT:
+       parseAndAppendFloatingPoint(parser);
+       break;
+     case VALUE_TRUE:
+       appendBoolean(true);
+       break;
+     case VALUE_FALSE:
+       appendBoolean(false);
+       break;
+     case VALUE_NULL:
+       appendNull();
+       break;
+     default:
+       throw new JsonParseException(parser, "Unexpected token " + token);
+   }
+ }
+
+ /**
+  * Picks the narrowest unsigned integer width, in bytes, that is able to hold `value`.
+  * The value is expected to lie in `[0, U24_MAX]`.
+  * @param value the value to get the size for
+  * @return 1, 2, or `U24_SIZE`, whichever is the smallest width that fits `value`
+  */
+ private int getMinIntegerSize(int value) {
+   assert value >= 0 && value <= U24_MAX;
+   if (value > U16_MAX) {
+     return U24_SIZE;
+   }
+   return value > U8_MAX ? 2 : 1;
+ }
+
+ /**
+  * Appends the JSON number at the parser's current token. A decimal value is preferred; when the
+  * text cannot be stored as a decimal, the number is appended as a double instead.
+  * @param parser the JSON parser to use
+  */
+ private void parseAndAppendFloatingPoint(JsonParser parser) throws IOException {
+   boolean appendedAsDecimal = tryParseDecimal(parser.getText());
+   if (appendedAsDecimal) {
+     return;
+   }
+   appendDouble(parser.getDoubleValue());
+ }
+
+ /**
+  * Attempts to append a JSON number as a decimal. Only plain decimal notation is accepted
+  * (digits with an optional '-' and '.'); scientific notation is rejected. The parsed value must
+  * also have a scale and precision no greater than `MAX_DECIMAL16_PRECISION`.
+  * @param input the input string to parse as decimal
+  * @return true if the value was appended as a decimal, false if nothing was appended
+  */
+ private boolean tryParseDecimal(String input) {
+   for (char c : input.toCharArray()) {
+     boolean isAsciiDigit = c >= '0' && c <= '9';
+     if (!isAsciiDigit && c != '-' && c != '.') {
+       return false;
+     }
+   }
+   BigDecimal parsed = new BigDecimal(input);
+   boolean fitsDecimal16 = parsed.scale() <= MAX_DECIMAL16_PRECISION
+       && parsed.precision() <= MAX_DECIMAL16_PRECISION;
+   if (!fitsDecimal16) {
+     return false;
+   }
+   appendDecimal(parsed);
+   return true;
+ }
+
+ /** The buffer for building the Variant value. The first `writePos` bytes have been written. */
+ private byte[] writeBuffer = new byte[128];
+ /** The index in `writeBuffer` at which the next byte will be written. */
+ private int writePos = 0;
+ /** The dictionary for mapping keys to monotonically increasing ids. */
+ private final HashMap dictionary = new HashMap<>();
+ /** The keys in the dictionary, in id order. */
+ private final ArrayList dictionaryKeys = new ArrayList<>();
+
+ /**
+ * Whether `finishWritingObject` keeps the last value of a duplicated key (true) or throws a
+ * `VariantDuplicateKeyException` (false).
+ */
+ private final boolean allowDuplicateKeys;
+ /** The upper bound on the capacity `checkCapacity` may grow `writeBuffer` to. */
+ private final int sizeLimitBytes;
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java
new file mode 100644
index 0000000000..12e94416c4
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantDuplicateKeyException.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+/**
+ * Thrown when an object being built into a Variant contains the same key more than once.
+ */
+public class VariantDuplicateKeyException extends RuntimeException {
+  private static final String MESSAGE_PREFIX =
+      "Failed to build Variant because of duplicate object key: ";
+
+  /** The key that was duplicated. */
+  public final String key;
+
+  /**
+   * @param key the key that was duplicated
+   */
+  public VariantDuplicateKeyException(String key) {
+    super(MESSAGE_PREFIX + key);
+    this.key = key;
+  }
+
+  /**
+   * @return the key that was duplicated
+   */
+  public String getKey() {
+    return this.key;
+  }
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java
new file mode 100644
index 0000000000..08556e762e
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantSizeLimitException.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+/**
+ * An exception indicating that the metadata or data size of the Variant exceeds the
+ * configured size limit.
+ */
+public class VariantSizeLimitException extends RuntimeException {
+}
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
new file mode 100644
index 0000000000..aeebfe67e1
--- /dev/null
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
@@ -0,0 +1,646 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides functions for
+ * manipulating Variant binaries.
+
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in
+ * the below constants for all possible basic type and type info values.
+
+ * The Variant metadata includes a version id and a dictionary of distinct strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the
+ * dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the
+ * starting position of string i, counting starting from the address of `offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+public class VariantUtil {
+ public static final int BASIC_TYPE_BITS = 2;
+ public static final int BASIC_TYPE_MASK = 0x3;
+ public static final int TYPE_INFO_MASK = 0x3F;
+ /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */
+ public static final int MAX_SHORT_STR_SIZE = 0x3F;
+
+ // The basic types
+
+ /**
+ * Primitive value.
+ * The type info value must be one of the values in the "Primitive" section below.
+ */
+ public static final int PRIMITIVE = 0;
+ /**
+ * Short string value.
+ * The type info value is the string size, which must be in `[0, MAX_SHORT_STR_SIZE]`.
+ * The string content bytes directly follow the header byte.
+ */
+ public static final int SHORT_STR = 1;
+ /**
+ * Object value.
+ * The content contains a size, a list of field ids, a list of field offsets, and
+ * the actual field values. The list of field ids has `size` ids, while the list of field offsets
+ * has `size + 1` offsets, where the last offset represents the total size of the field values
+ * data. The list of fields ids must be sorted by the field name in alphabetical order.
+ * Duplicate field names within one object are not allowed.
+ * 5 bits in the type info are used to specify the integer type of the object header. It is
+ * 0_b4_b3b2_b1b0 (MSB is 0), where:
+ * - b4: the integer type of size. When it is 0/1, `size` is a little-endian 1/4-byte
+ * unsigned integer.
+ * - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list contains
+ * 1/2/3-byte little-endian unsigned integers.
+ * - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the offset list contains
+ * 1/2/3-byte little-endian unsigned integers.
+ */
+ public static final int OBJECT = 2;
+ /**
+ * Array value.
+ * The content contains a size, a list of element offsets, and the actual element values.
+ * It is similar to an object without the id list. The length of the offset list
+ * is `size + 1`, where the last offset represents the total size of the element data.
+ * Its type info is: 000_b2_b1b0:
+ * - b2: the type of size.
+ * - b1b0: the integer type of offset.
+ */
+ public static final int ARRAY = 3;
+
+ // The primitive types
+
+ /** JSON Null value. Empty content. */
+ public static final int NULL = 0;
+ /** True value. Empty content. */
+ public static final int TRUE = 1;
+ /** False value. Empty content. */
+ public static final int FALSE = 2;
+ /** 1-byte little-endian signed integer. */
+ public static final int INT1 = 3;
+ /** 2-byte little-endian signed integer. */
+ public static final int INT2 = 4;
+ /** 4-byte little-endian signed integer. */
+ public static final int INT4 = 5;
+ /** 8-byte little-endian signed integer. */
+ public static final int INT8 = 6;
+ /** 8-byte IEEE double. */
+ public static final int DOUBLE = 7;
+ /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. */
+ public static final int DECIMAL4 = 8;
+ /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. */
+ public static final int DECIMAL8 = 9;
+ /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. */
+ public static final int DECIMAL16 = 10;
+ /**
+ * Date value. Content is 4-byte little-endian signed integer that represents the
+ * number of days from the Unix epoch.
+ */
+ public static final int DATE = 11;
+ /**
+ * Timestamp value. Content is 8-byte little-endian signed integer that represents the number of
+ * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in
+ * their local time zones and may be displayed differently depending on the execution environment.
+ */
+ public static final int TIMESTAMP = 12;
+ /**
+ * Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted
+ * as if the local time zone is UTC.
+ */
+ public static final int TIMESTAMP_NTZ = 13;
+ /** 4-byte IEEE float. */
+ public static final int FLOAT = 14;
+ /**
+ * Binary value. The content is (4-byte little-endian unsigned integer representing the binary
+ * size) + (size bytes of binary content).
+ */
+ public static final int BINARY = 15;
+ /**
+ * Long string value. The content is (4-byte little-endian unsigned integer representing the
+ * string size) + (size bytes of string content).
+ */
+ public static final int LONG_STR = 16;
+
+ // The metadata version.
+ public static final byte VERSION = 1;
+ // The lower 4 bits of the first metadata byte contain the version.
+ public static final byte VERSION_MASK = 0x0F;
+
+ // Constants for various unsigned integer sizes.
+ public static final int U8_MAX = 0xFF;
+ public static final int U16_MAX = 0xFFFF;
+ public static final int U24_MAX = 0xFFFFFF;
+ public static final int U24_SIZE = 3;
+ public static final int U32_SIZE = 4;
+
+ // Max decimal precision for each decimal type.
+ public static final int MAX_DECIMAL4_PRECISION = 9;
+ public static final int MAX_DECIMAL8_PRECISION = 18;
+ public static final int MAX_DECIMAL16_PRECISION = 38;
+
+ // Default size limit for both variant value and variant metadata.
+ public static final int DEFAULT_SIZE_LIMIT = U24_MAX + 1;
+
+ /**
+ * Write the least significant `numBytes` bytes in `value` into `bytes[pos, pos + numBytes)` in
+ * little endian.
+ * @param bytes The byte array to write into
+ * @param pos The starting index of the byte array to write into
+ * @param value The value to write
+ * @param numBytes The number of bytes to write
+ */
+ public static void writeLong(byte[] bytes, int pos, long value, int numBytes) {
+ for (int i = 0; i < numBytes; ++i) {
+ bytes[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF);
+ }
+ }
+
+ public static byte primitiveHeader(int type) {
+ return (byte) (type << 2 | PRIMITIVE);
+ }
+
+ public static byte shortStrHeader(int size) {
+ return (byte) (size << 2 | SHORT_STR);
+ }
+
+ public static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+ return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) |
+ ((idSize - 1) << (BASIC_TYPE_BITS + 2)) |
+ ((offsetSize - 1) << BASIC_TYPE_BITS) | OBJECT);
+ }
+
+ public static byte arrayHeader(boolean largeSize, int offsetSize) {
+ return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) |
+ ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+ }
+
+ /**
+ * Creates a new `MalformedVariantException`. Helpers in this class throw it when a bounds or
+ * format check on a Variant binary fails.
+ * @return the exception to throw
+ */
+ public static MalformedVariantException malformedVariant() {
+ return new MalformedVariantException();
+ }
+
+ /**
+ * Creates a new `UnknownVariantTypeException` for a primitive type id that this class does not
+ * recognize.
+ * @param id the unrecognized primitive type id (the type info of the header byte)
+ * @return the exception to throw
+ */
+ public static UnknownVariantTypeException unknownPrimitiveTypeInVariant(int id) {
+ return new UnknownVariantTypeException(id);
+ }
+
+ /**
+ * Check the validity of an array index `pos`.
+ * @param pos The index to check
+ * @param length The length of the array
+ * @throws MalformedVariantException if the index is out of bound
+ */
+ public static void checkIndex(int pos, int length) {
+ if (pos < 0 || pos >= length) throw malformedVariant();
+ }
+
+ /**
+  * Decodes `bytes[pos, pos + numBytes)` as a little-endian, two's-complement signed integer.
+  * @param bytes The byte array to read from
+  * @param pos The starting index of the byte array to read from
+  * @param numBytes The number of bytes to read
+  * @return The long value
+  */
+ static long readLong(byte[] bytes, int pos, int numBytes) {
+   checkIndex(pos, bytes.length);
+   checkIndex(pos + numBytes - 1, bytes.length);
+   int lastByte = numBytes - 1;
+   // The most significant byte carries the sign, so it is widened with sign extension.
+   long signedTop = bytes[pos + lastByte];
+   long accumulated = signedTop << (8 * lastByte);
+   // Every lower byte is treated as unsigned (hence `& 0xFF`) before being shifted in place.
+   for (int i = 0; i < lastByte; ++i) {
+     long unsignedByte = bytes[pos + i] & 0xFF;
+     accumulated |= unsignedByte << (8 * i);
+   }
+   return accumulated;
+ }
+
+ /**
+  * Decodes `bytes[pos, pos + numBytes)` as a little-endian unsigned integer. The result has to
+  * fit into a non-negative int (`[0, Integer.MAX_VALUE]`).
+  * @param bytes The byte array to read from
+  * @param pos The starting index of the byte array to read from
+  * @param numBytes The number of bytes to read
+  * @return The decoded non-negative value
+  * @throws MalformedVariantException if the range is out of bound or the value is negative
+  */
+ static int readUnsigned(byte[] bytes, int pos, int numBytes) {
+   checkIndex(pos, bytes.length);
+   checkIndex(pos + numBytes - 1, bytes.length);
+   int accumulated = 0;
+   int shift = 0;
+   // Unlike `readLong`, every byte, including the top one, is zero-extended.
+   for (int index = pos; index < pos + numBytes; index++) {
+     accumulated |= (bytes[index] & 0xFF) << shift;
+     shift += 8;
+   }
+   if (accumulated < 0) {
+     throw malformedVariant();
+   }
+   return accumulated;
+ }
+
+ /**
+ * The value type of Variant value. It is determined by the header byte but not a 1:1 mapping
+ * (for example, INT1/2/4/8 all maps to `Type.LONG`). See `getType` for the mapping.
+ */
+ public enum Type {
+ /** Basic type `OBJECT`. */
+ OBJECT,
+ /** Basic type `ARRAY`. */
+ ARRAY,
+ /** Primitive `NULL`. */
+ NULL,
+ /** Primitives `TRUE` and `FALSE`. */
+ BOOLEAN,
+ /** Primitives `INT1`, `INT2`, `INT4` and `INT8`. */
+ LONG,
+ /** Basic type `SHORT_STR` or primitive `LONG_STR`. */
+ STRING,
+ /** Primitive `DOUBLE`. */
+ DOUBLE,
+ /** Primitives `DECIMAL4`, `DECIMAL8` and `DECIMAL16`. */
+ DECIMAL,
+ /** Primitive `DATE`. */
+ DATE,
+ /** Primitive `TIMESTAMP`. */
+ TIMESTAMP,
+ /** Primitive `TIMESTAMP_NTZ`. */
+ TIMESTAMP_NTZ,
+ /** Primitive `FLOAT`. */
+ FLOAT,
+ /** Primitive `BINARY`. */
+ BINARY,
+ }
+
+ public static int getTypeInfo(byte[] value, int pos) {
+ checkIndex(pos, value.length);
+ return (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ }
+
+ /**
+  * Determines the value type of the Variant value at `value[pos...]`. A `get*` accessor may only
+  * be called when this method reports the matching type; for example, `getLong` is only legal
+  * when the result is `Type.LONG`.
+  * @param value The Variant value to get the type from
+  * @param pos The starting index of the Variant value
+  * @return The type of the Variant value
+  */
+ public static Type getType(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   int basicType = header & BASIC_TYPE_MASK;
+   // Non-primitive basic types map directly to a type.
+   if (basicType == SHORT_STR) {
+     return Type.STRING;
+   }
+   if (basicType == OBJECT) {
+     return Type.OBJECT;
+   }
+   if (basicType == ARRAY) {
+     return Type.ARRAY;
+   }
+   // Primitive: the type info selects the concrete type.
+   int typeInfo = (header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   switch (typeInfo) {
+     case NULL:
+       return Type.NULL;
+     case TRUE:
+     case FALSE:
+       return Type.BOOLEAN;
+     case INT1:
+     case INT2:
+     case INT4:
+     case INT8:
+       return Type.LONG;
+     case DOUBLE:
+       return Type.DOUBLE;
+     case DECIMAL4:
+     case DECIMAL8:
+     case DECIMAL16:
+       return Type.DECIMAL;
+     case DATE:
+       return Type.DATE;
+     case TIMESTAMP:
+       return Type.TIMESTAMP;
+     case TIMESTAMP_NTZ:
+       return Type.TIMESTAMP_NTZ;
+     case FLOAT:
+       return Type.FLOAT;
+     case BINARY:
+       return Type.BINARY;
+     case LONG_STR:
+       return Type.STRING;
+     default:
+       throw unknownPrimitiveTypeInVariant(typeInfo);
+   }
+ }
+
+ /**
+  * Measures how many bytes the Variant value at `value[pos...]` occupies, header byte included.
+  * `value.length - pos` is an upper bound of the size, but the actual size may be smaller.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return The actual size of the Variant value
+  * @throws MalformedVariantException if the Variant is malformed
+  */
+ public static int valueSize(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   int basicType = header & BASIC_TYPE_MASK;
+   int typeInfo = (header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   if (basicType == SHORT_STR) {
+     // For short strings the type info is the string length.
+     return 1 + typeInfo;
+   }
+   if (basicType == OBJECT) {
+     return handleObject(value, pos,
+         (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+           // The trailing entry of the offset list is the total size of the field data.
+           int totalDataSize = readUnsigned(value, offsetStart + size * offsetSize, offsetSize);
+           return dataStart - pos + totalDataSize;
+         });
+   }
+   if (basicType == ARRAY) {
+     return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> {
+       // The trailing entry of the offset list is the total size of the element data.
+       int totalDataSize = readUnsigned(value, offsetStart + size * offsetSize, offsetSize);
+       return dataStart - pos + totalDataSize;
+     });
+   }
+   // Primitive: one header byte plus a payload whose size depends on the type info.
+   switch (typeInfo) {
+     case NULL:
+     case TRUE:
+     case FALSE:
+       return 1;
+     case INT1:
+       return 2;
+     case INT2:
+       return 3;
+     case INT4:
+     case DATE:
+     case FLOAT:
+       return 5;
+     case DECIMAL4:
+       return 6;
+     case INT8:
+     case DOUBLE:
+     case TIMESTAMP:
+     case TIMESTAMP_NTZ:
+       return 9;
+     case DECIMAL8:
+       return 10;
+     case DECIMAL16:
+       return 18;
+     case BINARY:
+     case LONG_STR:
+       return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE);
+     default:
+       throw unknownPrimitiveTypeInVariant(typeInfo);
+   }
+ }
+
+ /**
+ * Creates the exception thrown by the `get*` accessors when the value at the given position does
+ * not have the type the accessor requires.
+ * @param type the type the accessor expected
+ * @return the exception to throw
+ */
+ private static IllegalStateException unexpectedType(Type type) {
+ return new IllegalStateException("Expect type to be " + type);
+ }
+
+ /**
+  * Reads a boolean from the Variant value at `value[pos...]`.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return true for the `TRUE` primitive, false for the `FALSE` primitive
+  * @throws IllegalStateException if the value is neither `TRUE` nor `FALSE`
+  */
+ public static boolean getBoolean(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   boolean isPrimitive = (header & BASIC_TYPE_MASK) == PRIMITIVE;
+   int typeInfo = (header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   if (isPrimitive && typeInfo == TRUE) {
+     return true;
+   }
+   if (isPrimitive && typeInfo == FALSE) {
+     return false;
+   }
+   throw unexpectedType(Type.BOOLEAN);
+ }
+
+ /**
+  * Reads an integral value from the Variant value at `value[pos...]`.
+  * It is only legal to call it if `getType` returns one of Type.LONG, DATE, TIMESTAMP,
+  * TIMESTAMP_NTZ.
+  * For `DATE`, the result fits into an int and is the number of days from the Unix epoch.
+  * For `TIMESTAMP/TIMESTAMP_NTZ`, the result is the number of microseconds from the Unix epoch.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return The long value
+  * @throws IllegalStateException if the value has none of the types listed above
+  */
+ public static long getLong(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ";
+   if ((header & BASIC_TYPE_MASK) != PRIMITIVE) {
+     throw new IllegalStateException(exceptionMessage);
+   }
+   // Map the primitive type to the width of its payload, then decode once.
+   int numBytes;
+   switch ((header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK) {
+     case INT1:
+       numBytes = 1;
+       break;
+     case INT2:
+       numBytes = 2;
+       break;
+     case INT4:
+     case DATE:
+       numBytes = 4;
+       break;
+     case INT8:
+     case TIMESTAMP:
+     case TIMESTAMP_NTZ:
+       numBytes = 8;
+       break;
+     default:
+       throw new IllegalStateException(exceptionMessage);
+   }
+   return readLong(value, pos + 1, numBytes);
+ }
+
+ /**
+  * Reads a double from the Variant value at `value[pos...]`.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return The double value
+  * @throws IllegalStateException if the value is not the `DOUBLE` primitive
+  */
+ public static double getDouble(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   boolean isDouble = (header & BASIC_TYPE_MASK) == PRIMITIVE
+       && ((header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK) == DOUBLE;
+   if (!isDouble) {
+     throw unexpectedType(Type.DOUBLE);
+   }
+   long bits = readLong(value, pos + 1, 8);
+   return Double.longBitsToDouble(bits);
+ }
+
+ /**
+  * Verifies that neither the precision nor the scale of a decimal exceeds `maxPrecision`.
+  * @param d The decimal value to check
+  * @param maxPrecision The maximum precision allowed
+  * @throws MalformedVariantException if the decimal is malformed
+  */
+ private static void checkDecimal(BigDecimal d, int maxPrecision) {
+   boolean precisionOk = d.precision() <= maxPrecision;
+   boolean scaleOk = d.scale() <= maxPrecision;
+   if (!(precisionOk && scaleOk)) {
+     throw malformedVariant();
+   }
+ }
+
+ public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) {
+ checkIndex(pos, value.length);
+ int basicType = value[pos] & BASIC_TYPE_MASK;
+ int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL);
+ // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be
+ // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`.
+ int scale = value[pos + 1] & 0xFF;
+ BigDecimal result;
+ switch (typeInfo) {
+ case DECIMAL4:
+ result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale);
+ checkDecimal(result, MAX_DECIMAL4_PRECISION);
+ break;
+ case DECIMAL8:
+ result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale);
+ checkDecimal(result, MAX_DECIMAL8_PRECISION);
+ break;
+ case DECIMAL16:
+ checkIndex(pos + 17, value.length);
+ byte[] bytes = new byte[16];
+ // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian
+ // representation.
+ for (int i = 0; i < 16; ++i) {
+ bytes[i] = value[pos + 17 - i];
+ }
+ result = new BigDecimal(new BigInteger(bytes), scale);
+ checkDecimal(result, MAX_DECIMAL16_PRECISION);
+ break;
+ default:
+ throw unexpectedType(Type.DECIMAL);
+ }
+ return result;
+ }
+
+ /**
+ * Reads a decimal from the Variant value at `value[pos...]`. This delegates to
+ * `getDecimalWithOriginalScale` and returns its result unchanged.
+ * @param value The Variant value
+ * @param pos The starting index of the Variant value
+ * @return The decimal value
+ */
+ public static BigDecimal getDecimal(byte[] value, int pos) {
+ return getDecimalWithOriginalScale(value, pos);
+ }
+
+ /**
+  * Reads a float from the Variant value at `value[pos...]`.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return The float value
+  * @throws IllegalStateException if the value is not the `FLOAT` primitive
+  */
+ public static float getFloat(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   boolean isFloat = (header & BASIC_TYPE_MASK) == PRIMITIVE
+       && ((header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK) == FLOAT;
+   if (!isFloat) {
+     throw unexpectedType(Type.FLOAT);
+   }
+   int bits = (int) readLong(value, pos + 1, 4);
+   return Float.intBitsToFloat(bits);
+ }
+
+ /**
+  * Reads a binary from the Variant value at `value[pos...]`.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return A copy of the binary content
+  * @throws IllegalStateException if the value is not the `BINARY` primitive
+  * @throws MalformedVariantException if the content runs past the end of `value`
+  */
+ public static byte[] getBinary(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   byte header = value[pos];
+   boolean isBinary = (header & BASIC_TYPE_MASK) == PRIMITIVE
+       && ((header >> BASIC_TYPE_BITS) & TYPE_INFO_MASK) == BINARY;
+   if (!isBinary) {
+     throw unexpectedType(Type.BINARY);
+   }
+   // The content is a 4-byte length followed by that many bytes.
+   int contentLength = readUnsigned(value, pos + 1, U32_SIZE);
+   int contentStart = pos + 1 + U32_SIZE;
+   int contentEnd = contentStart + contentLength;
+   checkIndex(contentEnd - 1, value.length);
+   return Arrays.copyOfRange(value, contentStart, contentEnd);
+ }
+ /**
+  * Reads the string stored at {@code value[pos]}, accepting both the SHORT_STR basic type and
+  * the PRIMITIVE/LONG_STR encoding. The bytes are always decoded as UTF-8 rather than with the
+  * platform default charset, so the result does not vary between JVM configurations.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @return The decoded string; other types fail with {@code unexpectedType(Type.STRING)}
+  */
+ public static String getString(byte[] value, int pos) {
+   checkIndex(pos, value.length);
+   int basicType = value[pos] & BASIC_TYPE_MASK;
+   int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   boolean isLong = basicType == PRIMITIVE && typeInfo == LONG_STR;
+   if (basicType != SHORT_STR && !isLong) throw unexpectedType(Type.STRING);
+   // A short string keeps its length in the type-info bits; a long string stores it before the data.
+   int start = isLong ? pos + 1 + U32_SIZE : pos + 1;
+   int length = isLong ? readUnsigned(value, pos + 1, U32_SIZE) : typeInfo;
+   checkIndex(start + length - 1, value.length);
+   return new String(value, start, length, java.nio.charset.StandardCharsets.UTF_8);
+ }
+
+ /**
+  * An interface for the Variant object handler.
+  * @param <T> The return type of the handler
+  */
+ public interface ObjectHandler<T> {
+   /**
+    * @param size Number of object fields.
+    * @param idSize The integer size of the field id list.
+    * @param offsetSize The integer size of the offset list.
+    * @param idStart The starting index of the field id list in the variant value array.
+    * @param offsetStart The starting index of the offset list in the variant value array.
+    * @param dataStart The starting index of field data in the variant value array.
+    */
+   T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart);
+ }
+
+ /**
+  * Decodes the header of the Variant object at {@code value[pos...]} and passes the resulting
+  * layout to {@code handler}; throws {@code unexpectedType(Type.OBJECT)} for non-object values.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @param handler The handler to process the object
+  * @param <T> The return type of the handler
+  * @return The result of the handler
+  */
+ public static <T> T handleObject(byte[] value, int pos, ObjectHandler<T> handler) {
+   checkIndex(pos, value.length);
+   int basicType = value[pos] & BASIC_TYPE_MASK;
+   int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   if (basicType != OBJECT) throw unexpectedType(Type.OBJECT);
+   // Header layout (see the `OBJECT` constant): `typeInfo` is 0_b4_b3b2_b1b0. Bit b4 selects
+   // whether the field count is stored in 1 byte or in `U32_SIZE` bytes.
+   boolean largeSize = ((typeInfo >> 4) & 0x1) != 0;
+   int sizeBytes = (largeSize ? U32_SIZE : 1);
+   int size = readUnsigned(value, pos + 1, sizeBytes);
+   // Extracts b3b2 to determine the integer size of the field id list.
+   int idSize = ((typeInfo >> 2) & 0x3) + 1;
+   // Extracts b1b0 to determine the integer size of the offset list.
+   int offsetSize = (typeInfo & 0x3) + 1;
+   int idStart = pos + 1 + sizeBytes;
+   int offsetStart = idStart + size * idSize;
+   int dataStart = offsetStart + (size + 1) * offsetSize;
+   return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart);
+ }
+
+ /**
+  * An interface for the Variant array handler.
+  * @param <T> The return type of the handler
+  */
+ public interface ArrayHandler<T> {
+   /**
+    * @param size Number of array elements.
+    * @param offsetSize The integer size of the offset list.
+    * @param offsetStart The starting index of the offset list in the variant value array.
+    * @param dataStart The starting index of element data in the variant value array.
+    */
+   T apply(int size, int offsetSize, int offsetStart, int dataStart);
+ }
+
+ /**
+  * Decodes the header of the Variant array at {@code value[pos...]} and passes the resulting
+  * layout to {@code handler}; throws {@code unexpectedType(Type.ARRAY)} for non-array values.
+  * @param value The Variant value
+  * @param pos The starting index of the Variant value
+  * @param handler The handler to process the array
+  * @param <T> The return type of the handler
+  * @return The result of the handler
+  */
+ public static <T> T handleArray(byte[] value, int pos, ArrayHandler<T> handler) {
+   checkIndex(pos, value.length);
+   int basicType = value[pos] & BASIC_TYPE_MASK;
+   int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+   if (basicType != ARRAY) throw unexpectedType(Type.ARRAY);
+   // Header layout (see the `ARRAY` constant): `typeInfo` is 000_b2_b1b0. Bit b2 selects
+   // whether the element count is stored in 1 byte or in `U32_SIZE` bytes.
+   boolean largeSize = ((typeInfo >> 2) & 0x1) != 0;
+   int sizeBytes = (largeSize ? U32_SIZE : 1);
+   int size = readUnsigned(value, pos + 1, sizeBytes);
+   // Extracts b1b0 to determine the integer size of the offset list.
+   int offsetSize = (typeInfo & 0x3) + 1;
+   int offsetStart = pos + 1 + sizeBytes;
+   int dataStart = offsetStart + (size + 1) * offsetSize;
+   return handler.apply(size, offsetSize, offsetStart, dataStart);
+ }
+
+ /**
+  * Returns a key at `id` in the Variant metadata, decoding its bytes as UTF-8.
+  * @param metadata The Variant metadata
+  * @param id The key id
+  * @return The key
+  * @throws MalformedVariantException if the Variant is malformed or if the id is out of bounds
+  */
+ public static String getMetadataKey(byte[] metadata, int id) {
+   checkIndex(0, metadata.length);
+   // The highest 2 bits of the metadata header encode the integer size of the offset list.
+   int offsetSize = ((metadata[0] >> 6) & 0x3) + 1;
+   int dictSize = readUnsigned(metadata, 1, offsetSize);
+   // A negative id would make the offset reads below land on the size field or earlier bytes.
+   if (id < 0 || id >= dictSize) throw malformedVariant();
+   // String data follows the header byte, the `dictSize` field and the `(dictSize + 1)` offsets.
+   int stringStart = 1 + (dictSize + 2) * offsetSize;
+   int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize);
+   int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize);
+   if (offset > nextOffset) throw malformedVariant();
+   checkIndex(stringStart + nextOffset - 1, metadata.length);
+   return new String(
+       metadata, stringStart + offset, nextOffset - offset, java.nio.charset.StandardCharsets.UTF_8);
+ }
+}
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java
new file mode 100644
index 0000000000..0a8740e3c0
--- /dev/null
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantEncoding.java
@@ -0,0 +1,491 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.security.SecureRandom;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
+import com.fasterxml.jackson.core.*;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Tests JSON parsing, typed appends and JSON rendering of {@code Variant} values. */
+public class TestVariantEncoding {
+  private static final Logger LOG = LoggerFactory.getLogger(TestVariantEncoding.class);
+  private static final String RANDOM_CHARS =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  private static final List<String> SAMPLE_JSON_VALUES = Arrays.asList(
+      "null",
+      "true",
+      "false",
+      "12",
+      "-9876543210",
+      "4.5678E123",
+      "8.765E-2",
+      "\"string value\"",
+      "-9876.543",
+      "234.456789",
+      "{\"a\": 1, \"b\": {\"e\": -4, \"f\": 5.5}, \"c\": true}",
+      "[1, -2, 4.5, -6.7, \"str\", true]"
+  );
+
+  /** Random number generator for generating random strings */
+  private static final SecureRandom random = new SecureRandom();
+  /** Object mapper for comparing json values */
+  private final ObjectMapper mapper = new ObjectMapper();
+  /** Asserts that parsing {@code jsonValue} and rendering it again yields an equal Jackson tree. */
+  private void checkJson(String jsonValue) {
+    try {
+      StreamReadConstraints.overrideDefaultStreamReadConstraints(
+          StreamReadConstraints.builder().maxNestingDepth(100000).build());
+      Variant v = VariantBuilder.parseJson(jsonValue);
+      Assert.assertEquals(mapper.readTree(jsonValue),
+          mapper.readTree(v.toJson(ZoneId.systemDefault())));
+    } catch (IOException e) {
+      Assert.fail("Failed to parse json: " + jsonValue + " " + e);
+    }
+  }
+  /** Asserts the basic-type bits of the header byte of {@code v} and its reported type info. */
+  private void checkType(Variant v, int expectedBasicType, int expectedTypeInfo) {
+    Assert.assertEquals(expectedBasicType, v.value[v.pos] & VariantUtil.BASIC_TYPE_MASK);
+    Assert.assertEquals(expectedTypeInfo, v.getTypeInfo());
+  }
+  /** Converts {@code instant} to microseconds since the epoch, dropping sub-microsecond nanos. */
+  private long microsSinceEpoch(Instant instant) {
+    return TimeUnit.SECONDS.toMicros(instant.getEpochSecond()) + instant.getNano() / 1000;
+  }
+  /** Builds a string of {@code len} characters drawn at random from {@code RANDOM_CHARS}. */
+  private String randomString(int len) {
+    StringBuilder sb = new StringBuilder(len);
+    for (int i = 0; i < len; i++) {
+      sb.append(RANDOM_CHARS.charAt(random.nextInt(RANDOM_CHARS.length())));
+    }
+    return sb.toString();
+  }
+  /** The JSON literal {@code null} survives a parse/render round trip. */
+  @Test
+  public void testNullJson() {
+    checkJson("null");
+  }
+  /** Both boolean literals survive a parse/render round trip. */
+  @Test
+  public void testBooleanJson() {
+    Arrays.asList("true", "false").forEach(this::checkJson);
+  }
+  /** Zero and the min/max of byte, short, int and long survive a round trip. */
+  @Test
+  public void testIntegerJson() {
+    Arrays.asList(
+        "0",
+        Byte.toString(Byte.MIN_VALUE), Byte.toString(Byte.MAX_VALUE),
+        Short.toString(Short.MIN_VALUE), Short.toString(Short.MAX_VALUE),
+        Integer.toString(Integer.MIN_VALUE), Integer.toString(Integer.MAX_VALUE),
+        Long.toString(Long.MIN_VALUE), Long.toString(Long.MAX_VALUE)
+    ).forEach(this::checkJson);
+  }
+  /** {@code MIN_VALUE} and {@code MAX_VALUE} of float and double survive a round trip. */
+  @Test
+  public void testFloatJson() {
+    Arrays.asList(
+        Float.toString(Float.MIN_VALUE), Float.toString(Float.MAX_VALUE),
+        Double.toString(Double.MIN_VALUE), Double.toString(Double.MAX_VALUE)
+    ).forEach(this::checkJson);
+  }
+  /** A short string and a string of more than 1000 characters survive a round trip. */
+  @Test
+  public void testStringJson() {
+    Arrays.asList(
+        "\"short string\"",
+        "\"long string: " + new String(new char[1000]).replace("\0", "x") + "\""
+    ).forEach(this::checkJson);
+  }
+  /** Decimal literals of increasing precision survive a round trip. */
+  @Test
+  public void testDecimalJson() {
+    Arrays.asList(
+        "12.34", "-43.21",
+        "10.2147483647", "-1021474836.47",
+        "109223372036854775.807", "-109.223372036854775807"
+    ).forEach(this::checkJson);
+  }
+  /** {@code appendNull} is expected to produce a PRIMITIVE header whose type info is NULL. */
+  @Test
+  public void testNullBuilder() {
+    VariantBuilder vb = new VariantBuilder(false);
+    vb.appendNull();
+    checkType(vb.result(), VariantUtil.PRIMITIVE, VariantUtil.NULL);
+  }
+  /** {@code appendBoolean} produces a PRIMITIVE header with TRUE or FALSE type info. */
+  @Test
+  public void testBooleanBuilder() {
+    Arrays.asList(true, false).forEach( b -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendBoolean(b);
+      checkType(vb2.result(), VariantUtil.PRIMITIVE, b ? VariantUtil.TRUE : VariantUtil.FALSE);
+    });
+  }
+  /** {@code appendLong} picks the narrowest of INT1/INT2/INT4/INT8 and preserves the value. */
+  @Test
+  public void testIntegerBuilder() {
+    Arrays.asList(
+        0L,
+        (long)Byte.MIN_VALUE, (long)Byte.MAX_VALUE,
+        (long)Short.MIN_VALUE, (long)Short.MAX_VALUE,
+        (long)Integer.MIN_VALUE, (long)Integer.MAX_VALUE,
+        Long.MIN_VALUE, Long.MAX_VALUE
+    ).forEach( l -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendLong(l);
+      Variant v = vb2.result();
+      if (Byte.MIN_VALUE <= l && l <= Byte.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT1);
+      } else if (Short.MIN_VALUE <= l && l <= Short.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT2);
+      } else if (Integer.MIN_VALUE <= l && l <= Integer.MAX_VALUE) {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT4);
+      } else {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.INT8);
+      }
+      Assert.assertEquals((long)l, v.getLong());
+    });
+  }
+  /** {@code appendFloat} produces a FLOAT primitive that reads back equal (delta 1e-6). */
+  @Test
+  public void testFloatBuilder() {
+    Arrays.asList(Float.MIN_VALUE, Float.MAX_VALUE).forEach( f -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendFloat(f);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.FLOAT);
+      Assert.assertEquals(f, v.getFloat(), 0.000001);
+    });
+  }
+  /** {@code appendDouble} produces a DOUBLE primitive that reads back equal (delta 1e-6). */
+  @Test
+  public void testDoubleBuilder() {
+    Arrays.asList(Double.MIN_VALUE, Double.MAX_VALUE).forEach( d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDouble(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DOUBLE);
+      Assert.assertEquals(d, v.getDouble(), 0.000001);
+    });
+  }
+  /** Lengths around {@code MAX_SHORT_STR_SIZE} switch from SHORT_STR to LONG_STR past the limit. */
+  @Test
+  public void testStringBuilder() {
+    IntStream.range(VariantUtil.MAX_SHORT_STR_SIZE - 3,
+        VariantUtil.MAX_SHORT_STR_SIZE + 3).forEach( len -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      String s = randomString(len);
+      vb2.appendString(s);
+      Variant v = vb2.result();
+      if (len <= VariantUtil.MAX_SHORT_STR_SIZE) {
+        checkType(v, VariantUtil.SHORT_STR, len);
+      } else {
+        checkType(v, VariantUtil.PRIMITIVE, VariantUtil.LONG_STR);
+      }
+      Assert.assertEquals(s, v.getString());
+    });
+  }
+  /** {@code appendDecimal} yields DECIMAL4/8/16 for the samples and preserves value and scale. */
+  @Test
+  public void testDecimalBuilder() {
+    // decimal4
+    Arrays.asList(new BigDecimal("123.456"), new BigDecimal("-987.654")).forEach( d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL4);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+
+    // decimal8
+    Arrays.asList(
+        new BigDecimal("10.2147483647"),
+        new BigDecimal("-1021474836.47")
+    ).forEach( d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL8);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+
+    // decimal16
+    Arrays.asList(
+        new BigDecimal("109223372036854775.807"),
+        new BigDecimal("-109.223372036854775807")
+    ).forEach( d -> {
+      VariantBuilder vb2 = new VariantBuilder(false);
+      vb2.appendDecimal(d);
+      Variant v = vb2.result();
+      checkType(v, VariantUtil.PRIMITIVE, VariantUtil.DECIMAL16);
+      Assert.assertEquals(d, v.getDecimal());
+    });
+  }
+  /** A date renders as a quoted ISO day and is exposed as epoch days through {@code getLong}. */
+  @Test
+  public void testDate() {
+    VariantBuilder vb = new VariantBuilder(false);
+    int days = Math.toIntExact(LocalDate.of(2024, 12, 16).toEpochDay());
+    vb.appendDate(days);
+    Assert.assertEquals("\"2024-12-16\"", vb.result().toJson(ZoneId.systemDefault()));
+    Assert.assertEquals(days, vb.result().getLong());
+  }
+  /** A timestamp renders in the requested zone and is exposed as epoch micros. */
+  @Test
+  public void testTimestamp() {
+    VariantBuilder vb = new VariantBuilder(false);
+    long micros = microsSinceEpoch(Instant.parse("2024-12-16T10:23:45.321456-08:00"));
+    vb.appendTimestamp(micros);
+    Assert.assertEquals("\"2024-12-16T10:23:45.321456-08:00\"",
+        vb.result().toJson(ZoneId.of("-08:00")));
+    Assert.assertEquals("\"2024-12-16T19:23:45.321456+01:00\"",
+        vb.result().toJson(ZoneId.of("+01:00")));
+    Assert.assertEquals(micros, vb.result().getLong());
+  }
+  /** A timestamp without time zone renders identically whatever zone is requested. */
+  @Test
+  public void testTimestampNtz() {
+    DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME;
+    VariantBuilder vb = new VariantBuilder(false);
+    long micros = microsSinceEpoch(Instant.from(dtf.parse("2024-01-01T23:00:00.000001Z")));
+    vb.appendTimestampNtz(micros);
+    Assert.assertEquals("\"2024-01-01T23:00:00.000001\"",
+        vb.result().toJson(ZoneId.of("-08:00")));
+    Assert.assertEquals(vb.result().toJson(ZoneId.of("-08:00")),
+        vb.result().toJson(ZoneId.of("+02:00")));
+    Assert.assertEquals(micros, vb.result().getLong());
+  }
+  /** Binary values render as quoted Base64 and read back byte-for-byte. */
+  @Test
+  public void testBinary() {
+    VariantBuilder vb = new VariantBuilder(false);
+    byte[] binary = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    vb.appendBinary(binary);
+    Assert.assertEquals("\"" + Base64.getEncoder().encodeToString(binary) + "\"",
+        vb.result().toJson(ZoneId.systemDefault()));
+    Assert.assertArrayEquals(binary, vb.result().getBinary());
+  }
+  /** Simple, wide (50000 fields) and deeply nested (1000 levels) objects survive a round trip. */
+  @Test
+  public void testObject() {
+    // simple object
+    StringBuilder sb = new StringBuilder();
+    sb.append("{");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("}");
+    checkJson(sb.toString());
+
+    // wide object
+    sb = new StringBuilder();
+    sb.append("{");
+    for (int i = 0; i < 50000; i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ")
+          .append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()));
+    }
+    sb.append("}");
+    checkJson(sb.toString());
+
+    // deep object
+    sb = new StringBuilder();
+    // Nesting stops at 1000 levels: the Jackson object mapper hits a stack overflow on deeper JSON
+    for (int i = 0; i < 1000; i++) {
+      sb.append("{").append("\"field" + i + "\": ");
+    }
+    sb.append("{");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append("\"field" + i + "\": ").append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("}");
+    for (int i = 0; i < 1000; i++) {
+      sb.append("}");
+    }
+    checkJson(sb.toString());
+  }
+  /** A small mixed-type array and a 50000-element array survive a round trip. */
+  @Test
+  public void testArray() {
+    // simple array
+    StringBuilder sb = new StringBuilder();
+    sb.append("[");
+    for (int i = 0; i < SAMPLE_JSON_VALUES.size(); i++) {
+      if (i > 0) sb.append(", ");
+      sb.append(SAMPLE_JSON_VALUES.get(i));
+    }
+    sb.append("]");
+    checkJson(sb.toString());
+
+    // large array
+    sb = new StringBuilder();
+    sb.append("[");
+    for (int i = 0; i < 50000; i++) {
+      if (i > 0) sb.append(", ");
+      sb.append(SAMPLE_JSON_VALUES.get(i % SAMPLE_JSON_VALUES.size()));
+    }
+    sb.append("]");
+    checkJson(sb.toString());
+  }
+  /** Metadata or data larger than the builder limit (20 here) raises VariantSizeLimitException. */
+  @Test
+  public void testSizeLimit() {
+    // large metadata size
+    try {
+      VariantBuilder.parseJson(
+          "{\"12345678901234567890\": 1, \"123456789012345678901\": 2}",
+          new VariantBuilder(false, 20));
+      Assert.fail("Expected VariantSizeLimitException with large metadata");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantSizeLimitException with large metadata");
+    } catch (VariantSizeLimitException e) {
+      // Expected
+    }
+
+    // large data size
+    try {
+      StringBuilder sb = new StringBuilder();
+      sb.append("[");
+      for (int i = 0; i < 100; i++) {
+        if (i > 0) sb.append(", ");
+        sb.append("{\"a\":1}");
+      }
+      sb.append("]");
+      VariantBuilder.parseJson(sb.toString(), new VariantBuilder(false, 20));
+      Assert.fail("Expected VariantSizeLimitException with large data");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantSizeLimitException with large data");
+    } catch (VariantSizeLimitException e) {
+      // Expected
+    }
+  }
+  /** Duplicate keys are rejected by default; when allowed, the last value is the one kept. */
+  @Test
+  public void testAllowDuplicateKeys() {
+    // disallow duplicate keys
+    try {
+      VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}");
+      Assert.fail("Expected VariantDuplicateKeyException with duplicate keys");
+    } catch (IOException e) {
+      Assert.fail("Expected VariantDuplicateKeyException with duplicate keys");
+    } catch (VariantDuplicateKeyException e) {
+      // Expected
+    }
+
+    // allow duplicate keys
+    try {
+      Variant v = VariantBuilder.parseJson("{\"a\": 1, \"a\": 2}",
+          new VariantBuilder(true, VariantUtil.DEFAULT_SIZE_LIMIT));
+      Assert.assertEquals(1, v.objectSize());
+      Assert.assertEquals(VariantUtil.Type.LONG, v.getFieldByKey("a").getType());
+      Assert.assertEquals(2, v.getFieldByKey("a").getLong());
+    } catch (Exception e) {
+      Assert.fail("Unexpected exception: " + e);
+    }
+  }
+  /** {@code toJson(zone, true)} strips trailing decimal zeros; {@code toJson(zone)} keeps them. */
+  @Test
+  public void testTruncateTrailingZeroDecimal() {
+    for (String[] strings : Arrays.asList(
+        // decimal4
+        // truncate all trailing zeros
+        new String[]{"1234.0000", "1234"},
+        // truncate some trailing zeros
+        new String[]{"1234.5600", "1234.56"},
+        // truncate no trailing zeros
+        new String[]{"1234.5678", "1234.5678"},
+        // decimal8
+        // truncate all trailing zeros
+        new String[]{"-10.0000000000", "-10"},
+        // truncate some trailing zeros
+        new String[]{"-10.2147000000", "-10.2147"},
+        // truncate no trailing zeros
+        new String[]{"-10.2147483647", "-10.2147483647"},
+        // decimal16
+        // truncate all trailing zeros
+        new String[]{"1092233720368547.00000", "1092233720368547"},
+        // truncate some trailing zeros
+        new String[]{"1092233720368547.75800", "1092233720368547.758"},
+        // truncate no trailing zeros
+        new String[]{"1092233720368547.75807", "1092233720368547.75807"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+      BigDecimal d = new BigDecimal(strings[0]);
+      vb.appendDecimal(d);
+      Variant v = vb.result();
+      Assert.assertEquals(strings[0], v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(strings[1], v.toJson(ZoneId.of("-08:00"), true));
+    }
+  }
+  /** {@code toJson(zone, true)} strips trailing zeros from a timestamp's fractional seconds. */
+  @Test
+  public void testTruncateTrailingZeroTimestamp() {
+    for (String[] strings : Arrays.asList(
+        // truncate all trailing zeros
+        new String[] {"2024-12-16T10:23:45.000000-08:00", "2024-12-16T10:23:45-08:00"},
+        // truncate some trailing zeros
+        new String[] {"2024-12-16T10:23:45.123000-08:00", "2024-12-16T10:23:45.123-08:00"},
+        // truncate no trailing zeros
+        new String[] {"2024-12-16T10:23:45.123456-08:00", "2024-12-16T10:23:45.123456-08:00"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+      long micros = microsSinceEpoch(Instant.parse(strings[0]));
+      vb.appendTimestamp(micros);
+      Variant v = vb.result();
+      Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true));
+    }
+  }
+  /** Same trailing-zero truncation check for timestamps without time zone. */
+  @Test
+  public void testTruncateTrailingZeroTimestampNtz() {
+    DateTimeFormatter dtf = DateTimeFormatter.ISO_DATE_TIME;
+    for (String[] strings : Arrays.asList(
+        // truncate all trailing zeros
+        new String[] {"2024-12-16T10:23:45.000000", "2024-12-16T10:23:45"},
+        // truncate some trailing zeros
+        new String[] {"2024-12-16T10:23:45.123000", "2024-12-16T10:23:45.123"},
+        // truncate no trailing zeros
+        new String[] {"2024-12-16T10:23:45.123456", "2024-12-16T10:23:45.123456"})) {
+      VariantBuilder vb = new VariantBuilder(false);
+
+      long micros = microsSinceEpoch(Instant.from(dtf.parse(String.format("%sZ", strings[0]))));
+      vb.appendTimestampNtz(micros);
+      Variant v = vb.result();
+      Assert.assertEquals(String.format("\"%s\"", strings[0]), v.toJson(ZoneId.of("-08:00")));
+      Assert.assertEquals(String.format("\"%s\"", strings[1]), v.toJson(ZoneId.of("-08:00"), true));
+      Assert.assertEquals(micros, vb.result().getLong());
+    }
+  }
+}
diff --git a/pom.xml b/pom.xml
index 2496171867..5f49bf1764 100644
--- a/pom.xml
+++ b/pom.xml
@@ -165,6 +165,7 @@
parquet-protobuf
parquet-thrift
parquet-hadoop-bundle
+ parquet-variant