From 911b0d8e4e08cbf40165515eef081b4078fa9804 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 May 2026 03:24:09 +0200 Subject: [PATCH 1/5] initial commit --- cpp/src/generated/parquet_types.cpp | 50 +- cpp/src/generated/parquet_types.h | 26 +- cpp/src/generated/parquet_types.tcc | 13 + .../parquet/arrow/arrow_reader_writer_test.cc | 784 +++++++++++++++++- cpp/src/parquet/arrow/arrow_schema_test.cc | 294 ++++++- cpp/src/parquet/arrow/path_internal.cc | 234 +++++- cpp/src/parquet/arrow/path_internal.h | 17 +- cpp/src/parquet/arrow/path_internal_test.cc | 82 ++ cpp/src/parquet/arrow/reader.cc | 209 ++++- cpp/src/parquet/arrow/schema.cc | 352 +++++++- cpp/src/parquet/arrow/schema.h | 5 + cpp/src/parquet/arrow/schema_internal.cc | 13 + cpp/src/parquet/arrow/schema_internal.h | 2 + cpp/src/parquet/arrow/writer.cc | 138 ++- cpp/src/parquet/column_writer.cc | 75 +- cpp/src/parquet/parquet.thrift | 21 + cpp/src/parquet/properties.h | 18 +- cpp/src/parquet/schema.cc | 129 ++- cpp/src/parquet/schema.h | 63 +- cpp/src/parquet/schema_test.cc | 101 ++- cpp/src/parquet/types.cc | 1 + cpp/src/parquet/types.h | 9 +- 22 files changed, 2533 insertions(+), 103 deletions(-) diff --git a/cpp/src/generated/parquet_types.cpp b/cpp/src/generated/parquet_types.cpp index 0ee973f2a2d6..3f8c9899a608 100644 --- a/cpp/src/generated/parquet_types.cpp +++ b/cpp/src/generated/parquet_types.cpp @@ -335,7 +335,20 @@ int _kFieldRepetitionTypeValues[] = { /** * The field is repeated and can contain 0 or more values */ - FieldRepetitionType::REPEATED + FieldRepetitionType::REPEATED, + /** + * Experimental Option B prototype for fixed-size vector/list values. + * + * This field repeats vector_length times per parent value without increasing the + * maximum definition or repetition level of its descendants. VECTOR-aware + * readers use the fixed multiplicity from the schema to group the physical + * child values back into one logical vector value. Vector-level nullability is + * represented by an OPTIONAL parent node. + * + * Readers that do not understand VECTOR are expected to reject the file rather + * than attempting a LIST fallback. + */ + FieldRepetitionType::VECTOR }; const char* _kFieldRepetitionTypeNames[] = { /** @@ -349,9 +362,22 @@ const char* _kFieldRepetitionTypeNames[] = { /** * The field is repeated and can contain 0 or more values */ - "REPEATED" + "REPEATED", + /** + * Experimental Option B prototype for fixed-size vector/list values. + * + * This field repeats vector_length times per parent value without increasing the + * maximum definition or repetition level of its descendants. VECTOR-aware + * readers use the fixed multiplicity from the schema to group the physical + * child values back into one logical vector value. Vector-level nullability is + * represented by an OPTIONAL parent node. + * + * Readers that do not understand VECTOR are expected to reject the file rather + * than attempting a LIST fallback. + */ + "VECTOR" }; -const std::map _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, nullptr, nullptr)); +const std::map _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, nullptr, nullptr)); std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) { std::map::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val); @@ -2565,7 +2591,8 @@ SchemaElement::SchemaElement() noexcept converted_type(static_cast(0)), scale(0), precision(0), - field_id(0) { + field_id(0), + vector_length(0) { } void SchemaElement::__set_type(const Type::type val) { @@ -2616,6 +2643,11 @@ void SchemaElement::__set_logicalType(const LogicalType& val) { this->logicalType = val; __isset.logicalType = true; } + +void SchemaElement::__set_vector_length(const int32_t val) { + this->vector_length = val; +__isset.vector_length = true; +} std::ostream& operator<<(std::ostream& out, const SchemaElement& obj) { obj.printTo(out); @@ -2635,6 +2667,7 @@ void swap(SchemaElement &a, SchemaElement &b) { swap(a.precision, b.precision); swap(a.field_id, b.field_id); swap(a.logicalType, b.logicalType); + swap(a.vector_length, b.vector_length); swap(a.__isset, b.__isset); } @@ -2678,6 +2711,10 @@ bool SchemaElement::operator==(const SchemaElement & rhs) const return false; else if (__isset.logicalType && !(logicalType == rhs.logicalType)) return false; + if (__isset.vector_length != rhs.__isset.vector_length) + return false; + else if (__isset.vector_length && !(vector_length == rhs.vector_length)) + return false; return true; } @@ -2692,6 +2729,7 @@ SchemaElement::SchemaElement(const SchemaElement& other126) { precision = other126.precision; field_id = other126.field_id; logicalType = other126.logicalType; + vector_length = other126.vector_length; __isset = other126.__isset; } SchemaElement::SchemaElement(SchemaElement&& other127) noexcept { @@ -2705,6 +2743,7 @@ SchemaElement::SchemaElement(SchemaElement&& other127) noexcept { precision = other127.precision; field_id = other127.field_id; logicalType = std::move(other127.logicalType); + vector_length = other127.vector_length; __isset = other127.__isset; } SchemaElement& SchemaElement::operator=(const SchemaElement& other128) { @@ -2718,6 +2757,7 @@ SchemaElement& SchemaElement::operator=(const SchemaElement& other128) { precision = other128.precision; field_id = other128.field_id; logicalType = other128.logicalType; + vector_length = other128.vector_length; __isset = other128.__isset; return *this; } @@ -2732,6 +2772,7 @@ SchemaElement& SchemaElement::operator=(SchemaElement&& other129) noexcept { precision = other129.precision; field_id = other129.field_id; logicalType = std::move(other129.logicalType); + vector_length = other129.vector_length; __isset = other129.__isset; return *this; } @@ -2748,6 +2789,7 @@ void SchemaElement::printTo(std::ostream& out) const { out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "")); out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "")); out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "")); + out << ", " << "vector_length="; (__isset.vector_length ? (out << to_string(vector_length)) : (out << "")); out << ")"; } diff --git a/cpp/src/generated/parquet_types.h b/cpp/src/generated/parquet_types.h index 1f1e254f5cf2..0bd7544114d7 100644 --- a/cpp/src/generated/parquet_types.h +++ b/cpp/src/generated/parquet_types.h @@ -202,7 +202,20 @@ struct FieldRepetitionType { /** * The field is repeated and can contain 0 or more values */ - REPEATED = 2 + REPEATED = 2, + /** + * Experimental Option B prototype for fixed-size vector/list values. + * + * This field repeats vector_length times per parent value without increasing the + * maximum definition or repetition level of its descendants. VECTOR-aware + * readers use the fixed multiplicity from the schema to group the physical + * child values back into one logical vector value. Vector-level nullability is + * represented by an OPTIONAL parent node. + * + * Readers that do not understand VECTOR are expected to reject the file rather + * than attempting a LIST fallback. + */ + VECTOR = 3 }; }; @@ -1732,7 +1745,7 @@ void swap(LogicalType &a, LogicalType &b); std::ostream& operator<<(std::ostream& out, const LogicalType& obj); typedef struct _SchemaElement__isset { - _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {} + _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false), vector_length(false) {} bool type :1; bool type_length :1; bool repetition_type :1; @@ -1742,6 +1755,7 @@ typedef struct _SchemaElement__isset { bool precision :1; bool field_id :1; bool logicalType :1; + bool vector_length :1; } _SchemaElement__isset; /** @@ -1820,6 +1834,12 @@ class SchemaElement { * for some logical types to ensure forward-compatibility in format v1. */ LogicalType logicalType; + /** + * Required when repetition_type is VECTOR. Must be positive. For nested VECTOR + * nodes, the effective number of physical leaf values per parent is the product + * of vector_length along the leaf's VECTOR ancestors. + */ + int32_t vector_length; _SchemaElement__isset __isset; @@ -1843,6 +1863,8 @@ class SchemaElement { void __set_logicalType(const LogicalType& val); + void __set_vector_length(const int32_t val); + bool operator == (const SchemaElement & rhs) const; bool operator != (const SchemaElement &rhs) const { return !(*this == rhs); diff --git a/cpp/src/generated/parquet_types.tcc b/cpp/src/generated/parquet_types.tcc index 78e3e2549394..710162fe76d6 100644 --- a/cpp/src/generated/parquet_types.tcc +++ b/cpp/src/generated/parquet_types.tcc @@ -2000,6 +2000,14 @@ uint32_t SchemaElement::read(Protocol_* iprot) { xfer += iprot->skip(ftype); } break; + case 12: + if (ftype == ::apache::thrift::protocol::T_I32) { + xfer += iprot->readI32(this->vector_length); + this->__isset.vector_length = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -2069,6 +2077,11 @@ uint32_t SchemaElement::write(Protocol_* oprot) const { xfer += this->logicalType.write(oprot); xfer += oprot->writeFieldEnd(); } + if (this->__isset.vector_length) { + xfer += oprot->writeFieldBegin("vector_length", ::apache::thrift::protocol::T_I32, 12); + xfer += oprot->writeI32(this->vector_length); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index d29458bf226b..62044274ad63 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -24,10 +24,13 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include #include #include #include #include +#include +#include #include #include @@ -421,6 +424,24 @@ void WriteTableToBuffer(const std::shared_ptr& table, int64_t row_group_s *out, WriteTableToBuffer(table, row_group_size, write_props, arrow_properties)); } +std::shared_ptr VectorWriterProperties() { + auto builder = WriterProperties::Builder(); + return builder.disable_dictionary() + ->disable_statistics() + ->disable_write_page_index() + ->encoding(Encoding::PLAIN) + ->build(); +} + +std::shared_ptr VectorByteStreamSplitWriterProperties() { + auto builder = WriterProperties::Builder(); + return builder.disable_dictionary() + ->disable_statistics() + ->disable_write_page_index() + ->encoding(Encoding::BYTE_STREAM_SPLIT) + ->build(); +} + void DoRoundtrip(const std::shared_ptr
& table, int64_t row_group_size, std::shared_ptr
* out, const std::shared_ptr<::parquet::WriterProperties>& writer_properties = @@ -442,6 +463,62 @@ void DoRoundtrip(const std::shared_ptr
& table, int64_t row_group_size, ASSERT_OK_AND_ASSIGN(*out, reader->ReadTable()); } +std::shared_ptr<::arrow::DataType> MakeVectorFixedSizeListType( + const std::shared_ptr<::arrow::DataType>& item_type, bool element_nullable = false) { + return ::arrow::fixed_size_list(::arrow::field("item", item_type, element_nullable), + /*size=*/3); +} + +std::shared_ptr<::arrow::DataType> VectorFixedSizeListType( + bool element_nullable = false) { + return MakeVectorFixedSizeListType(::arrow::int16(), element_nullable); +} + +std::shared_ptr<::arrow::DataType> VectorFloatFixedSizeListType( + bool element_nullable = false) { + return MakeVectorFixedSizeListType(::arrow::float32(), element_nullable); +} + +std::shared_ptr
MakeVectorFixedSizeListTable( + const std::vector>& chunks, bool nullable = true, + bool element_nullable = false) { + auto type = VectorFixedSizeListType(element_nullable); + auto field = ::arrow::field("root", type, nullable); + auto column = std::make_shared(chunks, type); + return ::arrow::Table::Make(::arrow::schema({field}), {column}); +} + +std::shared_ptr
MakeVectorFixedSizeListTable(std::string_view json, + bool nullable = true, + bool element_nullable = false) { + auto type = VectorFixedSizeListType(element_nullable); + return MakeVectorFixedSizeListTable({::arrow::ArrayFromJSON(type, std::string(json))}, + nullable, element_nullable); +} + +std::shared_ptr
MakeVectorFixedSizeListTable( + const std::shared_ptr<::arrow::DataType>& item_type, std::string_view json, + bool nullable = true, bool element_nullable = false) { + auto type = MakeVectorFixedSizeListType(item_type, element_nullable); + auto field = ::arrow::field("root", type, nullable); + auto array = ::arrow::ArrayFromJSON(type, std::string(json)); + return ::arrow::Table::Make(::arrow::schema({field}), {array}); +} + +void CheckVectorFixedSizeListRoundtrip( + const std::shared_ptr
& table, int64_t row_group_size, + const ArrowReaderProperties& arrow_reader_properties = + default_arrow_reader_properties()) { + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(table, row_group_size, &result, + VectorWriterProperties(), builder.build(), + arrow_reader_properties)); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), false); + ::arrow::AssertTablesEqual(*table, *result, false); +} + void CheckConfiguredRoundtrip( const std::shared_ptr
& input_table, const std::shared_ptr
& expected_table = nullptr, @@ -3341,6 +3418,710 @@ TEST(ArrowReadWrite, FixedSizeList) { CheckSimpleRoundtrip(table, 2, props_store_schema); } +TEST(ArrowWriteOnly, FixedSizeListVectorSchemaRequired) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9]])", + /*nullable=*/false); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/2, + VectorWriterProperties(), builder.build())); + + auto reader = ParquetFileReader::Open(std::make_shared(buffer)); + const auto* schema = reader->metadata()->schema(); + const auto* root = schema->group_node()->field(0).get(); + ASSERT_TRUE(root->is_group()); + const auto* root_group = static_cast(root); + ASSERT_EQ(root_group->repetition(), Repetition::REQUIRED); + ASSERT_TRUE(root_group->logical_type() == nullptr || + root_group->logical_type()->is_none()); + ASSERT_EQ(root_group->field_count(), 1); + ASSERT_TRUE(root_group->field(0)->is_vector()); + ASSERT_TRUE(root_group->field(0)->is_primitive()); + ASSERT_EQ(root_group->field(0)->vector_length(), 3); + ASSERT_EQ(reader->metadata()->RowGroup(0)->ColumnChunk(0)->num_values(), 6); + ASSERT_EQ(reader->metadata()->RowGroup(1)->ColumnChunk(0)->num_values(), 3); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorSchemaNullable) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + null, + [7, 8, 9]])"); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/3, + VectorWriterProperties(), builder.build())); + + auto reader = ParquetFileReader::Open(std::make_shared(buffer)); + const auto* schema = reader->metadata()->schema(); + const auto* root = schema->group_node()->field(0).get(); + ASSERT_TRUE(root->is_group()); + const auto* root_group = static_cast(root); + ASSERT_EQ(root_group->repetition(), Repetition::OPTIONAL); + ASSERT_TRUE(root_group->logical_type() == nullptr || + root_group->logical_type()->is_none()); + ASSERT_EQ(root_group->field_count(), 1); + ASSERT_TRUE(root_group->field(0)->is_vector()); + ASSERT_TRUE(root_group->field(0)->is_primitive()); + ASSERT_EQ(root_group->field(0)->vector_length(), 3); + // Option B uses fixed-stride nullable parent layout: every parent row, + // including a null vector row, contributes vector_length child slots. + ASSERT_EQ(reader->metadata()->RowGroup(0)->ColumnChunk(0)->num_values(), 9); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorNullRowsEmitFixedWidthNullSlots) { + auto table = MakeVectorFixedSizeListTable(R"([ + null, + null, + null])"); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/3, + VectorWriterProperties(), builder.build())); + + auto reader = ParquetFileReader::Open(std::make_shared(buffer)); + ASSERT_EQ(reader->metadata()->RowGroup(0)->num_rows(), 3); + ASSERT_EQ(reader->metadata()->RowGroup(0)->ColumnChunk(0)->num_values(), 9); +} + +TEST(ArrowReadWrite, FixedSizeListVectorRequiredRoundTrip) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9]])", + /*nullable=*/false); + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTrip) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + null, + [7, 8, 9]])"); + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/3)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripNullPatterns) { + const std::vector cases = { + R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", + R"([null, [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], [4, 5, 6], null])", + R"([[1, 2, 3], null, null, [10, 11, 12], null])", + R"([null, null, null])", + R"([])"}; + + for (const auto& json : cases) { + SCOPED_TRACE(json); + auto table = MakeVectorFixedSizeListTable(json); + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip( + table, std::max(1, table->num_rows()))); + } +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripRowGroups) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + null, + [7, 8, 9], + null, + [13, 14, 15], + [16, 17, 18]])"); + + ArrowReaderProperties reader_properties; + reader_properties.set_batch_size(1); + ASSERT_NO_FATAL_FAILURE( + CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2, reader_properties)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorStructOfVectorsRoundTrip) { + auto ids_type = ::arrow::fixed_size_list( + ::arrow::field("value", ::arrow::int32(), /*nullable=*/true), 2); + auto scores_type = ::arrow::fixed_size_list(::arrow::float32(), 3); + auto type = ::arrow::struct_({::arrow::field("ids", ids_type, /*nullable=*/true), + ::arrow::field("scores", scores_type, + /*nullable=*/false)}); + auto array = ::arrow::ArrayFromJSON(type, R"([ + {"ids": [10, null], "scores": [0.1, 0.2, 0.3]}, + {"ids": null, "scores": [1.1, 1.2, 1.3]}, + {"ids": [30, 40], "scores": [2.1, 2.2, 2.3]}])"); + auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), + {array}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorListOfVectorsRoundTrip) { + auto vector_type = ::arrow::fixed_size_list(::arrow::int32(), 2); + auto type = ::arrow::list(::arrow::field("item", vector_type, /*nullable=*/false)); + auto array = ::arrow::ArrayFromJSON(type, R"([ + [[1, 2], [3, 4]], + [[5, 6]], + [[7, 8], [9, 10], [11, 12]]])"); + auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), + {array}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorListOfStructsWithVectorsRoundTrip) { + auto vector_type = ::arrow::fixed_size_list(::arrow::float32(), 3); + auto struct_type = ::arrow::struct_( + {::arrow::field("embedding", vector_type, /*nullable=*/false)}); + auto type = ::arrow::list(::arrow::field("item", struct_type, /*nullable=*/false)); + auto array = ::arrow::ArrayFromJSON(type, R"([ + [{"embedding": [1.0, 2.0, 3.0]}, {"embedding": [4.0, 5.0, 6.0]}], + [{"embedding": [7.0, 8.0, 9.0]}], + [{"embedding": [10.0, 11.0, 12.0]}, {"embedding": [13.0, 14.0, 15.0]}]])"); + auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), + {array}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNestedVectorsRoundTrip) { + auto inner_type = ::arrow::fixed_size_list(::arrow::uint8(), 2); + auto type = ::arrow::fixed_size_list( + ::arrow::field("item", inner_type, /*nullable=*/false), 2); + auto array = ::arrow::ArrayFromJSON(type, R"([ + [[1, 2], [3, 4]], + null, + [[5, 6], [7, 8]]])"); + auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), + {array}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableChunkedReadAcrossRowGroups) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + null, + [7, 8, 9], + null, + [13, 14, 15]])"); + + ArrowWriterProperties::Builder writer_builder; + writer_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN( + auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, + VectorWriterProperties(), writer_builder.build())); + + ArrowReaderProperties reader_properties; + reader_properties.set_batch_size(2); + FileReaderBuilder reader_builder; + ASSERT_OK(reader_builder.Open(std::make_shared(buffer))); + reader_builder.properties(reader_properties); + std::unique_ptr reader; + ASSERT_OK(reader_builder.Build(&reader)); + ASSERT_OK_AND_ASSIGN(auto rb_reader, reader->GetRecordBatchReader()); + ASSERT_OK_AND_ASSIGN(auto out, Table::FromRecordBatchReader(rb_reader.get())); + + ASSERT_EQ(out->column(0)->num_chunks(), 3); + ASSERT_EQ(out->column(0)->chunk(0)->length(), 2); + ASSERT_EQ(out->column(0)->chunk(1)->length(), 2); + ASSERT_EQ(out->column(0)->chunk(2)->length(), 1); + ::arrow::AssertTablesEqual(*table, *out, false); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripSlicedInput) { + auto base = ::arrow::ArrayFromJSON(VectorFixedSizeListType(), R"([ + [100, 101, 102], + [1, 2, 3], + null, + [7, 8, 9], + null, + [200, 201, 202]])"); + auto table = MakeVectorFixedSizeListTable({base->Slice(/*offset=*/1, /*length=*/4)}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorRequiredRowsNullableElementsRoundTrip) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, null, 3], + [null, 5, 6], + [7, 8, null]])", + /*nullable=*/false, + /*element_nullable=*/true); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableRowsAndElementsRoundTrip) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, null, 3], + null, + [null, null, null], + [7, 8, null]])", + /*nullable=*/true, + /*element_nullable=*/true); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/4)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNestedRoundTrip) { + auto inner_type = ::arrow::fixed_size_list( + ::arrow::field("item", ::arrow::int16(), /*nullable=*/true), /*list_size=*/2); + auto outer_type = ::arrow::fixed_size_list( + ::arrow::field("item", inner_type, /*nullable=*/true), /*list_size=*/3); + auto table = ::arrow::Table::Make( + ::arrow::schema({::arrow::field("root", outer_type, /*nullable=*/true)}), + {::arrow::ArrayFromJSON(outer_type, R"([ + [[1, 2], null, [5, null]], + null, + [[7, 8], [9, 10], [null, 12]]])")}); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +struct VectorPrimitiveRoundTripCase { + std::string name; + std::shared_ptr<::arrow::DataType> item_type; + std::shared_ptr<::arrow::DataType> expected_item_type; + std::string required_json; + std::string nullable_rows_json; + std::string nullable_rows_and_elements_json; +}; + +class VectorPrimitiveRoundTripTest + : public ::testing::TestWithParam {}; + +TEST_P(VectorPrimitiveRoundTripTest, RequiredRows) { + const auto& param = GetParam(); + auto table = MakeVectorFixedSizeListTable(param.item_type, param.required_json, + /*nullable=*/false, + /*element_nullable=*/false); + auto expected = + param.expected_item_type + ? MakeVectorFixedSizeListTable(param.expected_item_type, param.required_json, + /*nullable=*/false, + /*element_nullable=*/false) + : nullptr; + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( + table, expected, VectorWriterProperties(), builder.build())); +} + +TEST_P(VectorPrimitiveRoundTripTest, NullableRows) { + const auto& param = GetParam(); + auto table = MakeVectorFixedSizeListTable(param.item_type, param.nullable_rows_json, + /*nullable=*/true, + /*element_nullable=*/false); + auto expected = param.expected_item_type + ? MakeVectorFixedSizeListTable(param.expected_item_type, + param.nullable_rows_json, + /*nullable=*/true, + /*element_nullable=*/false) + : nullptr; + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( + table, expected, VectorWriterProperties(), builder.build())); +} + +TEST_P(VectorPrimitiveRoundTripTest, NullableRowsAndElements) { + const auto& param = GetParam(); + auto table = + MakeVectorFixedSizeListTable(param.item_type, param.nullable_rows_and_elements_json, + /*nullable=*/true, + /*element_nullable=*/true); + auto expected = param.expected_item_type ? MakeVectorFixedSizeListTable( + param.expected_item_type, + param.nullable_rows_and_elements_json, + /*nullable=*/true, + /*element_nullable=*/true) + : nullptr; + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( + table, expected, VectorWriterProperties(), builder.build())); +} + +INSTANTIATE_TEST_SUITE_P( + PrimitiveElements, VectorPrimitiveRoundTripTest, + ::testing::Values( + VectorPrimitiveRoundTripCase{ + "Boolean", ::arrow::boolean(), nullptr, + R"([[true, false, true], [false, true, false], [true, true, false]])", + R"([[true, false, true], null, [false, true, false]])", + R"([[true, null, true], null, [null, true, false]])"}, + VectorPrimitiveRoundTripCase{ + "UInt8", ::arrow::uint8(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{"Int8", ::arrow::int8(), nullptr, + R"([[1, -2, 3], [4, -5, 6], [7, -8, 9]])", + R"([[1, -2, 3], null, [7, -8, 9]])", + R"([[1, null, 3], null, [null, -8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "UInt16", ::arrow::uint16(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{"Int16", ::arrow::int16(), nullptr, + R"([[1, -2, 3], [4, -5, 6], [7, -8, 9]])", + R"([[1, -2, 3], null, [7, -8, 9]])", + R"([[1, null, 3], null, [null, -8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "UInt32", ::arrow::uint32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "Int32", ::arrow::int32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "UInt64", ::arrow::uint64(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "Int64", ::arrow::int64(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "Float16", ::arrow::float16(), nullptr, + R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", + R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", + R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, + VectorPrimitiveRoundTripCase{ + "Float32", ::arrow::float32(), nullptr, + R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", + R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", + R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, + VectorPrimitiveRoundTripCase{ + "Float64", ::arrow::float64(), nullptr, + R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", + R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", + R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, + VectorPrimitiveRoundTripCase{ + "Date32", ::arrow::date32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "Time32Milli", ::arrow::time32(::arrow::TimeUnit::MILLI), nullptr, + R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", + R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "Time64Micro", ::arrow::time64(::arrow::TimeUnit::MICRO), nullptr, + R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", + R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "TimestampMilli", ::arrow::timestamp(::arrow::TimeUnit::MILLI), nullptr, + R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", + R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "DurationMilli", ::arrow::duration(::arrow::TimeUnit::MILLI), + ::arrow::int64(), R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", + R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, + VectorPrimitiveRoundTripCase{ + "FixedSizeBinary", ::arrow::fixed_size_binary(4), nullptr, + R"([["ab01", "cd02", "ef03"], ["gh04", "ij05", "kl06"], ["mn07", "op08", "qr09"]])", + R"([["ab01", "cd02", "ef03"], null, ["mn07", "op08", "qr09"]])", + R"([["ab01", null, "ef03"], null, [null, "op08", "qr09"]])"}, + VectorPrimitiveRoundTripCase{ + "Decimal32", ::arrow::decimal32(6, 3), ::arrow::decimal128(6, 3), + R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", + R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", + R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, + VectorPrimitiveRoundTripCase{ + "Decimal64", ::arrow::decimal64(16, 3), ::arrow::decimal128(16, 3), + R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", + R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", + R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, + VectorPrimitiveRoundTripCase{ + "Decimal128", ::arrow::decimal128(20, 3), nullptr, + R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", + R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", + R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, + VectorPrimitiveRoundTripCase{ + "Decimal256", ::arrow::decimal256(40, 3), nullptr, + R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", + R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", + R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +TEST(ArrowReadWrite, FixedSizeListVectorMixedColumnsRoundTripAcrossRowGroups) { + auto vector_type = + ::arrow::fixed_size_list(::arrow::field("item", ::arrow::int16(), true), + /*list_size=*/3); + auto list_type = ::arrow::list(::arrow::field("item", ::arrow::int32(), true)); + + auto vector_chunks = std::vector>{ + ::arrow::ArrayFromJSON(vector_type, + R"([[1, null, 3], null, [7, 8, null], [10, 11, 12]])"), + ::arrow::ArrayFromJSON(vector_type, + R"([null, [16, null, 18], [19, 20, 21], [22, null, 24]])")}; + auto table = ::arrow::Table::Make( + ::arrow::schema({ + ::arrow::field("id", ::arrow::int32(), false), + ::arrow::field("embedding", vector_type, true), + ::arrow::field("label", ::arrow::utf8(), true), + ::arrow::field("tags", list_type, true), + }), + { + std::make_shared( + ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 1, 2, 3, 4, 5, 6, 7]")), + std::make_shared(std::move(vector_chunks), vector_type), + std::make_shared(::arrow::ArrayFromJSON( + ::arrow::utf8(), R"(["a", null, "c", "d", null, "f", "g", "h"])")), + std::make_shared(::arrow::ArrayFromJSON( + list_type, R"([[1, 2], [], null, [3, null], [4], [5, 6], null, []])")), + }); + + ArrowWriterProperties::Builder writer_builder; + writer_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN( + auto buffer, WriteTableToBuffer(table, /*row_group_size=*/3, + VectorWriterProperties(), writer_builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + ASSERT_EQ(parquet_reader->metadata()->num_row_groups(), 3); + for (int i = 0; i < 3; ++i) { + const int64_t expected_rows = i == 2 ? 2 : 3; + EXPECT_EQ(parquet_reader->metadata()->RowGroup(i)->num_rows(), expected_rows); + EXPECT_EQ(parquet_reader->metadata()->RowGroup(i)->ColumnChunk(1)->num_values(), + expected_rows * 3); + } + + ArrowReaderProperties reader_properties; + reader_properties.set_batch_size(2); + FileReaderBuilder reader_builder; + ASSERT_OK(reader_builder.Open(std::make_shared(buffer))); + reader_builder.properties(reader_properties); + std::unique_ptr reader; + ASSERT_OK(reader_builder.Build(&reader)); + ASSERT_OK_AND_ASSIGN(auto rb_reader, reader->GetRecordBatchReader()); + ASSERT_OK_AND_ASSIGN(auto out, Table::FromRecordBatchReader(rb_reader.get())); + + ::arrow::AssertSchemaEqual(*table->schema(), *out->schema(), false); + ::arrow::AssertTablesEqual(*table, *out, false); +} + +TEST(ArrowReadWrite, FixedSizeListVectorMixedColumnsRoundtripEqualsOriginal) { + constexpr int32_t kVectorSize = 4; + auto vector_type = ::arrow::fixed_size_list( + ::arrow::field("item", ::arrow::float32(), false), kVectorSize); + auto list_type = ::arrow::list(::arrow::field("item", ::arrow::int32(), false)); + + auto table = ::arrow::Table::Make( + ::arrow::schema({ + ::arrow::field("id", ::arrow::int32(), false), + ::arrow::field("embedding", vector_type, false), + ::arrow::field("label", ::arrow::utf8(), false), + ::arrow::field("tags", list_type, false), + }), + { + ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 1, 2, 3, 4]"), + ::arrow::ArrayFromJSON(vector_type, R"([ + [0.0, 0.1, 0.2, 0.3], + [1.0, 1.1, 1.2, 1.3], + [2.0, 2.1, 2.2, 2.3], + [3.0, 3.1, 3.2, 3.3], + [4.0, 4.1, 4.2, 4.3] + ])"), + ::arrow::ArrayFromJSON(::arrow::utf8(), + R"(["alpha", "bravo", "charlie", "delta", "echo"])"), + ::arrow::ArrayFromJSON(list_type, + R"([[10, 11], [], [12], [13, 14, 15], [16, 17]])"), + }); + + ArrowWriterProperties::Builder writer_builder; + writer_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN( + auto buffer, WriteTableToBuffer(table, /*row_group_size=*/table->num_rows(), + VectorWriterProperties(), writer_builder.build())); + + std::unique_ptr reader; + FileReaderBuilder reader_builder; + ASSERT_OK(reader_builder.Open(std::make_shared(buffer))); + ASSERT_OK(reader_builder.Build(&reader)); + std::shared_ptr
out; + ASSERT_OK(reader->ReadTable(&out)); + + ::arrow::AssertSchemaEqual(*table->schema(), *out->schema(), false); + ::arrow::AssertTablesEqual(*table, *out, false); +} + +std::shared_ptr<::arrow::DataType> VectorFixedSizeListStructType( + bool element_nullable = false, bool field_nullable = false) { + return ::arrow::fixed_size_list( + ::arrow::field( + "item", + ::arrow::struct_({::arrow::field("x", ::arrow::float32(), false), + ::arrow::field("y", ::arrow::int32(), field_nullable)}), + element_nullable), + /*size=*/2); +} + +std::shared_ptr
MakeVectorFixedSizeListStructTable(std::string_view json, + bool nullable = true, + bool element_nullable = false, + bool field_nullable = false) { + auto type = VectorFixedSizeListStructType(element_nullable, field_nullable); + auto field = ::arrow::field("root", type, nullable); + auto array = ::arrow::ArrayFromJSON(type, std::string(json)); + return ::arrow::Table::Make(::arrow::schema({field}), {array}); +} + +std::shared_ptr<::arrow::DataType> VectorFixedSizeListNestedStructType( + bool element_nullable = false) { + return ::arrow::fixed_size_list( + ::arrow::field( + "item", + ::arrow::struct_( + {::arrow::field( + "point", + ::arrow::struct_({::arrow::field("x", ::arrow::float32(), false), + ::arrow::field("y", ::arrow::int32(), true)}), + false), + ::arrow::field("z", ::arrow::int16(), true)}), + element_nullable), + /*size=*/2); +} + +std::shared_ptr
MakeVectorFixedSizeListNestedStructTable( + std::string_view json, bool nullable = true, bool element_nullable = false) { + auto type = VectorFixedSizeListNestedStructType(element_nullable); + auto field = ::arrow::field("root", type, nullable); + auto array = ::arrow::ArrayFromJSON(type, std::string(json)); + return ::arrow::Table::Make(::arrow::schema({field}), {array}); +} + +TEST(ArrowReadWrite, FixedSizeListVectorStructRoundTrip) { + auto table = MakeVectorFixedSizeListStructTable(R"([ + [{"x": 1.0, "y": 1}, {"x": 2.0, "y": 2}], + [{"x": 3.0, "y": 3}, {"x": 4.0, "y": 4}], + [{"x": 5.0, "y": 5}, {"x": 6.0, "y": 6}]])", + /*nullable=*/false); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorStructNullableRoundTrip) { + auto table = MakeVectorFixedSizeListStructTable(R"([ + [{"x": 1.0, "y": 1}, {"x": 2.0, "y": 2}], + null, + [{"x": 5.0, "y": 5}, {"x": 6.0, "y": 6}], + null])"); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorStructNullableFieldsRoundTrip) { + auto table = MakeVectorFixedSizeListStructTable(R"([ + [{"x": 1.0, "y": 1}, {"x": 2.0, "y": null}], + null, + [{"x": 5.0, "y": null}, {"x": 6.0, "y": 6}]])", + /*nullable=*/true, + /*element_nullable=*/false, + /*field_nullable=*/true); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableStructElementsRoundTrip) { + auto table = MakeVectorFixedSizeListStructTable(R"([ + [{"x": 1.0, "y": 1}, null], + null, + [null, {"x": 6.0, "y": null}], + [{"x": 7.0, "y": 7}, {"x": 8.0, "y": 8}]])", + /*nullable=*/true, + /*element_nullable=*/true, + /*field_nullable=*/true); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNestedStructRoundTrip) { + auto table = MakeVectorFixedSizeListNestedStructTable(R"([ + [{"point": {"x": 1.0, "y": 1}, "z": 10}, + {"point": {"x": 2.0, "y": null}, "z": null}], + [{"point": {"x": 3.0, "y": 3}, "z": 30}, + {"point": {"x": 4.0, "y": 4}, "z": 40}], + [{"point": {"x": 5.0, "y": null}, "z": 50}, + {"point": {"x": 6.0, "y": 6}, "z": 60}]])", + /*nullable=*/false, + /*element_nullable=*/false); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowReadWrite, FixedSizeListVectorNullableNestedStructElementsRoundTrip) { + auto table = MakeVectorFixedSizeListNestedStructTable(R"([ + [{"point": {"x": 1.0, "y": 1}, "z": 10}, null], + null, + [null, {"point": {"x": 4.0, "y": 4}, "z": 40}], + [{"point": {"x": 5.0, "y": null}, "z": null}, + {"point": {"x": 6.0, "y": 6}, "z": 60}]])", + /*nullable=*/true, + /*element_nullable=*/true); + + ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorRejectsDefaultWriterProperties) { + auto table = MakeVectorFixedSizeListTable(R"([ + [1, 2, 3], + [4, 5, 6]])", + /*nullable=*/false); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_RAISES(Invalid, + WriteTableToBuffer(table, /*row_group_size=*/2, + default_writer_properties(), builder.build())); +} + +TEST(ArrowReadWrite, FixedSizeListVectorByteStreamSplitRoundTrip) { + auto table = MakeVectorFixedSizeListTable(::arrow::float32(), R"([ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + [7.0, 8.0, 9.0]])", + /*nullable=*/false); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN( + auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/2, + VectorByteStreamSplitWriterProperties(), builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + const auto encodings = + parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0)->encodings(); + ASSERT_NE(std::find(encodings.begin(), encodings.end(), Encoding::BYTE_STREAM_SPLIT), + encodings.end()); + + FileReaderBuilder reader_builder; + ASSERT_OK(reader_builder.Open(std::make_shared(buffer))); + std::unique_ptr reader; + ASSERT_OK(reader_builder.Build(&reader)); + std::shared_ptr
result; + ASSERT_OK(reader->ReadTable(&result)); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), false); + ::arrow::AssertTablesEqual(*table, *result, false); +} + +TEST(ArrowReadWrite, FixedSizeListVectorByteStreamSplitNullableRoundTrip) { + auto table = MakeVectorFixedSizeListTable(::arrow::float32(), R"([ + [1.0, 2.0, 3.0], + null, + [7.0, 8.0, 9.0]])"); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoRoundtrip( + table, 3, &result, VectorByteStreamSplitWriterProperties(), builder.build())); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), false); + ::arrow::AssertTablesEqual(*table, *result, false); +} + TEST(ArrowReadWrite, ListOfStructOfList2) { using ::arrow::field; using ::arrow::list; @@ -4360,8 +5141,7 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { GTEST_SKIP() << "ARROW_TEST_DATA not set."; } std::stringstream ss; - ss << c_root << "/" - << "parquet/ARROW-17100.parquet"; + ss << c_root << "/" << "parquet/ARROW-17100.parquet"; std::string path = ss.str(); TryReadDataFile(path); } diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 7d9ecb5e6449..ed2d4cf290f9 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -289,6 +289,96 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +TEST_F(TestConvertParquetSchema, VectorFixedSizeList) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto element = PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::FLOAT); + auto vector = GroupNode::Make("vector", Repetition::VECTOR, {element}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + arrow_fields.push_back(::arrow::field( + "embedding", + ::arrow::fixed_size_list(::arrow::field("element", ::arrow::float32(), false), 3), + true)); + + ASSERT_OK(ConvertSchema(parquet_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(::arrow::schema(arrow_fields))); +} + +TEST_F(TestConvertParquetSchema, VectorFixedSizeListNullableStructElement) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto x = PrimitiveNode::Make("x", Repetition::REQUIRED, ParquetType::FLOAT); + auto y = PrimitiveNode::Make("y", Repetition::OPTIONAL, ParquetType::INT32); + auto item = GroupNode::Make("element", Repetition::OPTIONAL, {x, y}); + auto vector = GroupNode::Make("vector", Repetition::VECTOR, {item}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + arrow_fields.push_back(::arrow::field( + "embedding", + ::arrow::fixed_size_list( + ::arrow::field("element", + ::arrow::struct_({::arrow::field("x", FLOAT, false), + ::arrow::field("y", INT32, true)}), + true), + 3), + true)); + + ASSERT_OK(ConvertSchema(parquet_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(::arrow::schema(arrow_fields))); +} + +TEST_F(TestConvertParquetSchema, VectorFixedSizeListNullableNestedStructElement) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto x = PrimitiveNode::Make("x", Repetition::REQUIRED, ParquetType::FLOAT); + auto y = PrimitiveNode::Make("y", Repetition::OPTIONAL, ParquetType::INT32); + auto point = GroupNode::Make("point", Repetition::REQUIRED, {x, y}); + auto z = PrimitiveNode::Make("z", Repetition::OPTIONAL, LogicalType::Int(16, true), + ParquetType::INT32); + auto item = GroupNode::Make("element", Repetition::OPTIONAL, {point, z}); + auto vector = GroupNode::Make("vector", Repetition::VECTOR, {item}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + arrow_fields.push_back(::arrow::field( + "embedding", + ::arrow::fixed_size_list( + ::arrow::field( + "element", + ::arrow::struct_( + {::arrow::field("point", + ::arrow::struct_({::arrow::field("x", FLOAT, false), + ::arrow::field("y", INT32, true)}), + false), + ::arrow::field("z", ::arrow::int16(), true)}), + true), + 3), + true)); + + ASSERT_OK(ConvertSchema(parquet_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(::arrow::schema(arrow_fields))); +} + +TEST_F(TestConvertParquetSchema, VectorFixedSizeListStructWithListRejected) { + auto list_element = + PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32); + auto list = GroupNode::Make("list", Repetition::REPEATED, {list_element}); + auto items = + GroupNode::Make("items", Repetition::REQUIRED, {list}, LogicalType::List()); + auto vector = GroupNode::Make("vector", Repetition::VECTOR, {items}, + /*logical_type=*/nullptr, -1, 3); + std::vector parquet_fields = {GroupNode::Make( + "embedding", Repetition::OPTIONAL, {vector}, /*logical_type=*/nullptr)}; + + ASSERT_RAISES(NotImplemented, ConvertSchema(parquet_fields)); +} + TEST_F(TestConvertParquetSchema, DuplicateFieldNames) { std::vector parquet_fields; std::vector> arrow_fields; @@ -1746,12 +1836,8 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) { auto arrow_list = ::arrow::large_list(arrow_element); arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); } - // // FixedSizeList[10] (list-like non-null, elements nullable) - // required group my_list (LIST) { - // repeated group list { - // optional binary element (UTF8); - // } - // } + // FixedSizeList defaults to the legacy LIST encoding unless experimental VECTOR + // encoding is explicitly enabled. { auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::UTF8); @@ -1768,6 +1854,202 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); } +TEST_F(TestConvertArrowSchema, + ParquetFixedSizeListVectorUnsupportedElementFallsBackToList) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("embedding", Repetition::OPTIONAL, {list}, ConvertedType::LIST)); + + auto arrow_element = ::arrow::field("string", UTF8, true); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorZeroLengthFallsBackToList) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto element = PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::FLOAT); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("embedding", Repetition::OPTIONAL, {list}, ConvertedType::LIST)); + + auto arrow_element = ::arrow::field("element", FLOAT, false); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 0); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorMixedSupportedAndFallback) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto vector = PrimitiveNode::Make("element", Repetition::VECTOR, ParquetType::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + + auto string_element = PrimitiveNode::Make("element", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto string_list = GroupNode::Make("list", Repetition::REPEATED, {string_element}); + parquet_fields.push_back(GroupNode::Make("labels", Repetition::OPTIONAL, {string_list}, + ConvertedType::LIST)); + + arrow_fields.push_back(::arrow::field( + "embedding", ::arrow::fixed_size_list(::arrow::field("element", FLOAT, false), 3), + true)); + arrow_fields.push_back(::arrow::field( + "labels", ::arrow::fixed_size_list(::arrow::field("string", UTF8, true), 3), true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorNullableElement) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto item = PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, -1); + auto vector = GroupNode::Make("element", Repetition::VECTOR, {item}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + + auto arrow_element = ::arrow::field("element", FLOAT, true); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorNullableStructElement) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto x = PrimitiveNode::Make("x", Repetition::REQUIRED, ParquetType::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, -1); + auto y = PrimitiveNode::Make("y", Repetition::OPTIONAL, ParquetType::INT32, + ConvertedType::NONE, -1, -1, -1, -1, -1); + auto item = GroupNode::Make("element", Repetition::OPTIONAL, {x, y}); + auto vector = GroupNode::Make("element", Repetition::VECTOR, {item}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + + auto arrow_element = ::arrow::field( + "element", + ::arrow::struct_({::arrow::field("x", FLOAT, false), ::arrow::field("y", INT32)}), + true); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorNullableNestedStructElement) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto x = PrimitiveNode::Make("x", Repetition::REQUIRED, ParquetType::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, -1); + auto y = PrimitiveNode::Make("y", Repetition::OPTIONAL, ParquetType::INT32, + ConvertedType::NONE, -1, -1, -1, -1, -1); + auto point = GroupNode::Make("point", Repetition::REQUIRED, {x, y}); + auto z = PrimitiveNode::Make("z", Repetition::OPTIONAL, LogicalType::Int(16, true), + ParquetType::INT32, -1, -1, -1); + auto item = GroupNode::Make("element", Repetition::OPTIONAL, {point, z}); + auto vector = GroupNode::Make("element", Repetition::VECTOR, {item}, + /*logical_type=*/nullptr, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + + auto arrow_element = ::arrow::field( + "element", + ::arrow::struct_( + {::arrow::field("point", + ::arrow::struct_({::arrow::field("x", FLOAT, false), + ::arrow::field("y", INT32, true)}), + false), + ::arrow::field("z", ::arrow::int16(), true)}), + true); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVectorStructWithListFallsBackToList) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto item = PrimitiveNode::Make("element", Repetition::REQUIRED, ParquetType::INT32); + auto items_list = GroupNode::Make("list", Repetition::REPEATED, {item}); + auto items = + GroupNode::Make("items", Repetition::REQUIRED, {items_list}, ConvertedType::LIST); + auto element = GroupNode::Make("element", Repetition::REQUIRED, {items}); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("embedding", Repetition::OPTIONAL, {list}, ConvertedType::LIST)); + + auto arrow_element = ::arrow::field( + "element", + ::arrow::struct_({::arrow::field( + "items", ::arrow::list(::arrow::field("element", INT32, false)), false)}), + false); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ParquetFixedSizeListVector) { + std::vector parquet_fields; + std::vector> arrow_fields; + + auto vector = PrimitiveNode::Make("element", Repetition::VECTOR, ParquetType::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 3); + parquet_fields.push_back(GroupNode::Make("embedding", Repetition::OPTIONAL, {vector}, + /*logical_type=*/nullptr)); + + auto arrow_element = ::arrow::field("element", FLOAT, false); + auto arrow_list = ::arrow::fixed_size_list(arrow_element, 3); + arrow_fields.push_back(::arrow::field("embedding", arrow_list, true)); + + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + ASSERT_OK(ConvertSchema(arrow_fields, builder.build())); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + TEST_F(TestConvertArrowSchema, ParquetNestedComplianceEnabledNullable) { std::vector parquet_fields; std::vector> arrow_fields; diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 002859a5e7d5..c131f80c244d 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -106,6 +106,7 @@ #include "arrow/util/macros.h" #include "arrow/visit_array_inline.h" +#include "parquet/arrow/schema_internal.h" #include "parquet/properties.h" namespace parquet::arrow { @@ -209,6 +210,17 @@ struct PathWriteContext { visited_elements.push_back(range); } + void RecordVectorPostListVisit(const ElementRange& logical_range, + const ElementRange& physical_range) { + if (!visited_elements.empty() && + visited_elements.back().start == logical_range.start && + visited_elements.back().end == logical_range.end) { + visited_elements.back() = physical_range; + return; + } + RecordPostListVisit(physical_range); + } + Status last_status; TypedBufferBuilder rep_levels; TypedBufferBuilder def_levels; @@ -450,6 +462,96 @@ struct FixedSizedRangeSelector { int list_size; }; +struct NoLevelTerminalNode { + IterationResult Run(const ElementRange&, PathWriteContext*) { return kDone; } +}; + +class VectorNullableNode { + public: + VectorNullableNode(const uint8_t* null_bitmap, int64_t entry_offset, + int32_t vector_length, int32_t child_physical_width, + int16_t def_level_if_present, bool child_emits_present_def_levels, + bool child_records_visited_elements) + : null_bitmap_(null_bitmap), + entry_offset_(entry_offset), + vector_length_(vector_length), + child_physical_width_(child_physical_width), + valid_bits_reader_(MakeReader(ElementRange{0, 0})), + def_level_if_present_(def_level_if_present), + def_level_if_null_(def_level_if_present - 1), + child_emits_present_def_levels_(child_emits_present_def_levels), + child_records_visited_elements_(child_records_visited_elements), + new_range_(true) {} + + ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) { + return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start, + range.Size()); + } + + IterationResult Run(ElementRange* range, ElementRange* child_range, + PathWriteContext* context) { + if (range->Empty()) { + new_range_ = true; + return kDone; + } + if (null_bitmap_ == nullptr) { + ElementRange logical_range = *range; + child_range->start = range->start * vector_length_; + child_range->end = child_range->start + range->Size() * vector_length_; + if (!child_records_visited_elements_) { + context->RecordVectorPostListVisit(logical_range, *child_range); + } + if (!child_emits_present_def_levels_) { + RETURN_IF_ERROR( + context->AppendDefLevels(child_range->Size(), def_level_if_present_)); + } + range->start = range->end; + new_range_ = false; + return kNext; + } + if (new_range_) { + valid_bits_reader_ = MakeReader(*range); + } + ::arrow::internal::BitRun run = valid_bits_reader_.NextRun(); + while (!range->Empty() && !run.set) { + range->start += run.length; + RETURN_IF_ERROR(context->AppendDefLevels( + run.length * vector_length_ * child_physical_width_, def_level_if_null_)); + run = valid_bits_reader_.NextRun(); + } + if (range->Empty()) { + new_range_ = true; + return kDone; + } + + ElementRange logical_range{range->start, range->start + run.length}; + child_range->start = range->start * vector_length_; + child_range->end = child_range->start + run.length * vector_length_; + if (!child_records_visited_elements_) { + context->RecordVectorPostListVisit(logical_range, *child_range); + } + if (!child_emits_present_def_levels_) { + RETURN_IF_ERROR( + context->AppendDefLevels(run.length * vector_length_, def_level_if_present_)); + } + range->start += run.length; + new_range_ = false; + return kNext; + } + + private: + const uint8_t* null_bitmap_; + int64_t entry_offset_; + int32_t vector_length_; + int32_t child_physical_width_; + ::arrow::internal::BitRunReader valid_bits_reader_; + int16_t def_level_if_present_; + int16_t def_level_if_null_; + bool child_emits_present_def_levels_; + bool child_records_visited_elements_; + bool new_range_ = true; +}; + // An intermediate node that handles null values. class NullableNode { public: @@ -512,6 +614,15 @@ using ListNode = ListPathNode>; using LargeListNode = ListPathNode>; using FixedSizeListNode = ListPathNode; +int32_t VectorPhysicalWidth(const ::arrow::DataType& type) { + if (type.id() != ::arrow::Type::FIXED_SIZE_LIST) { + return 1; + } + const auto& list_type = + ::arrow::internal::checked_cast(type); + return list_type.list_size() * VectorPhysicalWidth(*list_type.value_type()); +} + // Contains static information derived from traversing the schema. struct PathInfo { // The vectors are expected to the same length info. @@ -519,7 +630,8 @@ struct PathInfo { // Note index order matters here. using Node = std::variant; + NullableNode, VectorNullableNode, AllPresentTerminalNode, + AllNullsTerminalNode, NoLevelTerminalNode>; std::vector path; std::shared_ptr primitive_array; @@ -527,6 +639,8 @@ struct PathInfo { int16_t max_rep_level = 0; bool has_dictionary = false; bool leaf_is_nullable = false; + bool leaf_is_vector = false; + int32_t leaf_vector_length = 1; }; /// Contains logic for writing a single leaf node to parquet. @@ -536,6 +650,37 @@ struct PathInfo { /// values have been calculated for root_range with the calculated /// values. It is intended to abstract the complexity of writing /// the levels and values to parquet. +void ExpandVectorRepLevels(const int16_t* levels, int64_t length, int32_t multiplier, + int16_t repeated_level, std::vector* out) { + out->clear(); + out->reserve(static_cast(length * multiplier)); + for (int64_t i = 0; i < length; ++i) { + out->push_back(levels[i]); + out->insert(out->end(), multiplier - 1, repeated_level); + } +} + +Status ExpandVectorRepLevels(PathInfo* path_info, PathWriteContext* context, + MultipathLevelBuilderResult* builder_result) { + if (!path_info->leaf_is_vector || path_info->leaf_vector_length <= 1 || + context->rep_levels.length() == 0) { + return Status::OK(); + } + const int64_t rep_length = context->rep_levels.length(); + const int64_t def_length = context->def_levels.length(); + if (def_length != rep_length * path_info->leaf_vector_length) { + return Status::Invalid("VECTOR repetition level expansion mismatch: repetition " + "level count ", + rep_length, ", definition level count ", def_length, + ", vector length ", path_info->leaf_vector_length); + } + ExpandVectorRepLevels(context->rep_levels.data(), rep_length, + path_info->leaf_vector_length, path_info->max_rep_level, + &builder_result->owned_rep_levels); + builder_result->rep_levels = builder_result->owned_rep_levels.data(); + return Status::OK(); +} + Status WritePath(ElementRange root_range, PathInfo* path_info, ArrowWriteContext* arrow_context, MultipathLevelBuilder::CallbackFunction writer) { @@ -543,6 +688,8 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, MultipathLevelBuilderResult builder_result; builder_result.leaf_array = path_info->primitive_array; builder_result.leaf_is_nullable = path_info->leaf_is_nullable; + builder_result.leaf_is_vector = path_info->leaf_is_vector; + builder_result.leaf_vector_length = path_info->leaf_vector_length; if (path_info->max_def_level == 0) { // This case only occurs when there are no nullable or repeated @@ -597,6 +744,12 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, IterationResult operator()(LargeListNode& node) { return node.Run(stack_position, stack_position + 1, context); } + IterationResult operator()(VectorNullableNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } + IterationResult operator()(NoLevelTerminalNode& node) { + return node.Run(*stack_position, context); + } ElementRange* stack_position; PathWriteContext* context; } visitor = {stack_position, &context}; @@ -624,12 +777,17 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, builder_result.post_list_visited_elements.push_back({0, 0}); } } else { - builder_result.post_list_visited_elements.push_back( - {0, builder_result.leaf_array->length()}); + if (!context.visited_elements.empty()) { + std::swap(builder_result.post_list_visited_elements, context.visited_elements); + } else { + builder_result.post_list_visited_elements.push_back( + {0, builder_result.leaf_array->length()}); + } builder_result.rep_levels = nullptr; } builder_result.def_levels = context.def_levels.data(); + RETURN_NOT_OK(ExpandVectorRepLevels(path_info, &context, &builder_result)); return writer(builder_result); } @@ -661,6 +819,7 @@ struct FixupVisitor { } void operator()(NullableNode& arg) { HandleIntermediateNode(arg); } + void operator()(VectorNullableNode&) {} void operator()(AllNullsTerminalNode& arg) { // Even though no processing happens past this point we @@ -671,6 +830,7 @@ struct FixupVisitor { void operator()(NullableTerminalNode&) {} void operator()(AllPresentTerminalNode&) {} + void operator()(NoLevelTerminalNode&) {} }; PathInfo Fixup(PathInfo info) { @@ -692,7 +852,9 @@ PathInfo Fixup(PathInfo info) { class PathBuilder { public: - explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {} + PathBuilder(bool start_nullable, bool write_fixed_size_list_as_vector) + : nullable_in_parent_(start_nullable), + write_fixed_size_list_as_vector_(write_fixed_size_list_as_vector) {} template void AddTerminalInfo(const T& array) { info_.leaf_is_nullable = nullable_in_parent_; @@ -801,8 +963,57 @@ class PathBuilder { } Status Visit(const ::arrow::FixedSizeListArray& array) { - MaybeAddNullable(array); int32_t list_size = array.list_type()->list_size(); + if (write_fixed_size_list_as_vector_ && list_size > 0 && + IsSupportedVectorElementType(*array.value_type())) { + const bool element_nullable = array.list_type()->value_field()->nullable(); + const bool nested_vector_path = info_.leaf_is_vector; + info_.leaf_is_vector = true; + info_.leaf_vector_length *= list_size; + const bool parent_nullable = nullable_in_parent_; + if (parent_nullable) { + info_.max_def_level++; + } + const bool value_type_is_struct = array.value_type()->id() == ::arrow::Type::STRUCT; + const bool value_type_is_nested_vector = + array.value_type()->id() == ::arrow::Type::FIXED_SIZE_LIST; + const bool child_emits_present_def_levels = element_nullable || + value_type_is_struct || + value_type_is_nested_vector || + nested_vector_path; + const bool child_records_visited_elements = value_type_is_nested_vector; + const bool has_vector_node = parent_nullable || element_nullable || + value_type_is_struct || value_type_is_nested_vector || + nested_vector_path || info_.max_rep_level > 0; + const bool vector_node_emits_def_levels = + has_vector_node && !child_emits_present_def_levels; + if (has_vector_node) { + info_.path.emplace_back(VectorNullableNode( + parent_nullable ? array.null_bitmap_data() : nullptr, array.offset(), + list_size, VectorPhysicalWidth(*array.value_type()), info_.max_def_level, + child_emits_present_def_levels, child_records_visited_elements)); + } + auto values = + array.values()->Slice(array.value_offset(0), array.length() * list_size); + if (!nested_vector_path && !element_nullable && + !::arrow::is_nested(*array.value_type())) { + info_.leaf_is_nullable = false; + info_.primitive_array = values; + if (!vector_node_emits_def_levels && info_.max_rep_level > 0 && + info_.max_def_level > 0) { + info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level}); + } else { + info_.path.emplace_back(NoLevelTerminalNode{}); + } + paths_.push_back(Fixup(info_)); + return Status::OK(); + } + nullable_in_parent_ = element_nullable; + RETURN_NOT_OK(VisitInline(*values)); + return Status::OK(); + } + + MaybeAddNullable(array); // Technically we could encode fixed size lists with two level encodings // but since we always use 3 level encoding we increment def levels as // well. @@ -840,6 +1051,7 @@ class PathBuilder { PathInfo info_; std::vector paths_; bool nullable_in_parent_; + bool write_fixed_size_list_as_vector_; }; Status PathBuilder::VisitInline(const Array& array) { @@ -883,8 +1095,10 @@ class MultipathLevelBuilderImpl : public MultipathLevelBuilder { // static ::arrow::Result> MultipathLevelBuilder::Make( - const ::arrow::Array& array, bool array_field_nullable) { - auto constructor = std::make_unique(array_field_nullable); + const ::arrow::Array& array, bool array_field_nullable, + bool write_fixed_size_list_as_vector) { + auto constructor = std::make_unique(array_field_nullable, + write_fixed_size_list_as_vector); RETURN_NOT_OK(VisitArrayInline(array, constructor.get())); return std::make_unique(array.data(), std::move(constructor)); @@ -894,8 +1108,12 @@ ::arrow::Result> MultipathLevelBuilder::M Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable, ArrowWriteContext* context, MultipathLevelBuilder::CallbackFunction callback) { + const bool write_fixed_size_list_as_vector = + context->properties != nullptr && + context->properties->write_fixed_size_list_as_vector(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr builder, - MultipathLevelBuilder::Make(array, array_field_nullable)); + MultipathLevelBuilder::Make(array, array_field_nullable, + write_fixed_size_list_as_vector)); for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) { RETURN_NOT_OK(builder->Write(leaf_idx, context, callback)); } diff --git a/cpp/src/parquet/arrow/path_internal.h b/cpp/src/parquet/arrow/path_internal.h index 50d2bf24291a..e862dfc8face 100644 --- a/cpp/src/parquet/arrow/path_internal.h +++ b/cpp/src/parquet/arrow/path_internal.h @@ -94,6 +94,20 @@ struct MultipathLevelBuilderResult { /// Whether the leaf array is nullable. bool leaf_is_nullable; + + /// Whether this leaf is produced from an Arrow FixedSizeList being written as + /// Parquet VECTOR. For nullable VECTOR elements the physical Parquet leaf is + /// below the VECTOR node, so checking only the primitive schema node is not + /// sufficient. + bool leaf_is_vector = false; + + /// Fixed number of physical leaf slots per logical VECTOR value for this leaf. + int32_t leaf_vector_length = 1; + + /// Owned repetition-level storage used when VECTOR post-processing must expand + /// levels produced by repeated ancestors from logical vector slots to physical + /// leaf slots. + std::vector owned_rep_levels; }; /// \brief Logic for being able to write out nesting (rep/def level) data that is @@ -132,7 +146,8 @@ class PARQUET_EXPORT MultipathLevelBuilder { /// the array column as nullable (as determined by its type's parent /// field). static ::arrow::Result> Make( - const ::arrow::Array& array, bool array_field_nullable); + const ::arrow::Array& array, bool array_field_nullable, + bool write_fixed_size_list_as_vector = false); virtual ~MultipathLevelBuilder() = default; diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index 0145e889ddaf..09c6acd3732b 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -551,6 +551,88 @@ TEST_F(MultipathLevelBuilderTest, TestFixedSizeList) { EXPECT_THAT(results_[0].post_list_elements[0].end, Eq(6)); } +TEST_F(MultipathLevelBuilderTest, TestFixedSizeListExperimentalVector) { + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + arrow_properties_ = builder.build(); + context_ = ArrowWriteContext(default_memory_pool(), arrow_properties_.get()); + + auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); + auto list_type = fixed_size_list(entries, 2); + auto array = ArrayFromJSON(list_type, "[[1, 2], [3, 4], [5, 6]]"); + + ASSERT_OK( + MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, callback_)); + ASSERT_THAT(results_, SizeIs(1)); + EXPECT_TRUE(results_[0].null_rep_levels); + EXPECT_TRUE(results_[0].null_def_levels); + ASSERT_THAT(results_[0].post_list_elements, SizeIs(1)); + EXPECT_EQ(results_[0].post_list_elements[0].start, 0); + EXPECT_EQ(results_[0].post_list_elements[0].end, 6); +} + +TEST_F(MultipathLevelBuilderTest, TestFixedSizeListExperimentalVectorNullableElements) { + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + arrow_properties_ = builder.build(); + context_ = ArrowWriteContext(default_memory_pool(), arrow_properties_.get()); + + auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); + auto list_type = fixed_size_list(entries, 2); + auto array = ArrayFromJSON(list_type, "[[1, null], [3, 4], [null, 6]]"); + + ASSERT_OK( + MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, callback_)); + ASSERT_THAT(results_, SizeIs(1)); + results_[0].CheckLevelsWithNullRepLevels(std::vector{1, 0, 1, 1, 0, 1}); + ASSERT_THAT(results_[0].post_list_elements, SizeIs(1)); + EXPECT_EQ(results_[0].post_list_elements[0].start, 0); + EXPECT_EQ(results_[0].post_list_elements[0].end, 6); +} + +TEST_F(MultipathLevelBuilderTest, + TestFixedSizeListExperimentalVectorNullableRowsAndElements) { + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + arrow_properties_ = builder.build(); + context_ = ArrowWriteContext(default_memory_pool(), arrow_properties_.get()); + + auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); + auto list_type = fixed_size_list(entries, 2); + auto array = ArrayFromJSON(list_type, "[[1, null], null, [null, 6]]"); + + ASSERT_OK( + MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_THAT(results_, SizeIs(1)); + results_[0].CheckLevelsWithNullRepLevels(std::vector{2, 1, 0, 0, 1, 2}); + ASSERT_THAT(results_[0].post_list_elements, SizeIs(2)); + EXPECT_EQ(results_[0].post_list_elements[0].start, 0); + EXPECT_EQ(results_[0].post_list_elements[0].end, 2); + EXPECT_EQ(results_[0].post_list_elements[1].start, 4); + EXPECT_EQ(results_[0].post_list_elements[1].end, 6); +} + +TEST_F(MultipathLevelBuilderTest, TestFixedSizeListExperimentalVectorNullable) { + ArrowWriterProperties::Builder builder; + builder.enable_experimental_vector_encoding(); + arrow_properties_ = builder.build(); + context_ = ArrowWriteContext(default_memory_pool(), arrow_properties_.get()); + + auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); + auto list_type = fixed_size_list(entries, 2); + auto array = ArrayFromJSON(list_type, "[[1, 2], null, [5, 6]]"); + + ASSERT_OK( + MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_THAT(results_, SizeIs(1)); + results_[0].CheckLevelsWithNullRepLevels(std::vector{1, 1, 0, 0, 1, 1}); + ASSERT_THAT(results_[0].post_list_elements, SizeIs(2)); + EXPECT_EQ(results_[0].post_list_elements[0].start, 0); + EXPECT_EQ(results_[0].post_list_elements[0].end, 2); + EXPECT_EQ(results_[0].post_list_elements[1].start, 4); + EXPECT_EQ(results_[0].post_list_elements[1].end, 6); +} + TEST_F(MultipathLevelBuilderTest, TestFixedSizeListMissingMiddleHasTwoVisitedRanges) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); auto list_type = fixed_size_list(entries, 2); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index a60af69aec9f..091d536b32ee 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -26,6 +26,8 @@ #include #include "arrow/array.h" +#include "arrow/array/concatenate.h" +#include "arrow/array/util.h" #include "arrow/buffer.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" @@ -126,6 +128,7 @@ class ColumnReaderImpl : public ColumnReader { virtual ::arrow::Status BuildArray(int64_t length_upper_bound, std::shared_ptr<::arrow::ChunkedArray>* out) = 0; virtual bool IsOrHasRepeatedChild() const = 0; + virtual bool is_vector() const { return false; } }; namespace { @@ -273,10 +276,19 @@ class FileReaderImpl : public FileReader { // TODO(wesm): This calculation doesn't make much sense when we have repeated // schema nodes int64_t records_to_read = 0; + DCHECK(dynamic_cast(reader) != nullptr); + const bool is_vector = static_cast(reader)->is_vector(); for (auto row_group : row_groups) { - // Can throw exception - records_to_read += - reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values(); + // Can throw exception. ColumnReader::NextBatch takes logical parent records. + // VECTOR columns still store row_count * vector_length physical child slots + // (including null child slots for null parent rows), but the VECTOR reader + // multiplies the requested parent count by list_size_ internally. + if (is_vector) { + records_to_read += reader_->metadata()->RowGroup(row_group)->num_rows(); + } else { + records_to_read += + reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values(); + } } #ifdef ARROW_WITH_OPENTELEMETRY std::string column_name = reader_->metadata()->schema()->Column(i)->name(); @@ -575,6 +587,8 @@ class ExtensionReader : public ColumnReaderImpl { return storage_reader_->IsOrHasRepeatedChild(); } + bool is_vector() const final { return storage_reader_->is_vector(); } + const std::shared_ptr field() override { return field_; } private: @@ -603,6 +617,8 @@ class ListReader : public ColumnReaderImpl { bool IsOrHasRepeatedChild() const final { return true; } + bool is_vector() const final { return false; } + Status LoadBatch(int64_t number_of_records) final { return item_reader_->LoadBatch(number_of_records); } @@ -706,6 +722,173 @@ class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader { } }; +// Reads Parquet VECTOR columns into Arrow FixedSizeList arrays. +// +// VECTOR stores one definition level per element even though the public Arrow result is a +// single FixedSizeList slot per row. For nullable VECTOR rows, the child reader therefore +// materializes spaced child slots and this reader collapses each vector's per-element def +// levels back into a parent validity bitmap. +class PARQUET_NO_EXPORT VectorFixedSizeListReader : public ColumnReaderImpl { + public: + VectorFixedSizeListReader(std::shared_ptr ctx, + std::shared_ptr field, + ::parquet::internal::LevelInfo level_info, + std::unique_ptr child_reader) + : ctx_(std::move(ctx)), + field_(std::move(field)), + level_info_(level_info), + item_reader_(std::move(child_reader)), + list_size_(checked_cast(*field_->type()) + .list_size()) {} + + Status GetDefLevels(const int16_t** data, int64_t* length) override { + if (collapsed_def_levels_.empty()) { + *data = nullptr; + *length = rows_loaded_; + } else { + *data = collapsed_def_levels_.data(); + *length = rows_loaded_; + } + return Status::OK(); + } + + Status GetRepLevels(const int16_t** data, int64_t* length) override { + if (collapsed_rep_levels_.empty()) { + *data = nullptr; + } else { + *data = collapsed_rep_levels_.data(); + } + *length = rows_loaded_; + return Status::OK(); + } + + bool IsOrHasRepeatedChild() const final { return false; } + + bool is_vector() const final { return true; } + + Status LoadBatch(int64_t number_of_records) final { + rows_loaded_ = 0; + collapsed_def_levels_.clear(); + collapsed_rep_levels_.clear(); + const int64_t child_records_to_read = + level_info_.rep_level > 0 ? number_of_records : number_of_records * list_size_; + RETURN_NOT_OK(item_reader_->LoadBatch(child_records_to_read)); + + const int16_t* def_levels = nullptr; + int64_t num_levels = 0; + RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels)); + const int16_t* rep_levels = nullptr; + int64_t num_rep_levels = 0; + RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_rep_levels)); + if (rep_levels != nullptr && num_rep_levels != num_levels) { + return Status::Invalid("VECTOR child produced ", num_rep_levels, + " repetition levels for ", num_levels, + " definition levels"); + } + if (num_levels % list_size_ != 0) { + return Status::Invalid("VECTOR column produced a non-multiple of vector length ", + num_levels, " for list_size=", list_size_); + } + rows_loaded_ = num_levels / list_size_; + + if (rep_levels != nullptr) { + collapsed_rep_levels_.reserve(static_cast(rows_loaded_)); + for (int64_t row = 0; row < rows_loaded_; ++row) { + const int64_t first_level_index = row * list_size_; + collapsed_rep_levels_.push_back(rep_levels[first_level_index]); + } + } + + if (def_levels != nullptr) { + collapsed_def_levels_.reserve(static_cast(rows_loaded_)); + for (int64_t row = 0; row < rows_loaded_; ++row) { + const int64_t first_level_index = row * list_size_; + const bool first_is_present = + def_levels[first_level_index] >= level_info_.def_level; + for (int32_t i = 1; i < list_size_; ++i) { + const bool slot_is_present = + def_levels[first_level_index + i] >= level_info_.def_level; + if (slot_is_present != first_is_present) { + return Status::Invalid( + "VECTOR parent validity changed within one fixed-size vector at row ", + row, + "; null VECTOR rows must still emit exactly list_size null child " + "slots and all slots for one parent must agree on parent " + "validity"); + } + } + collapsed_def_levels_.push_back(first_is_present ? level_info_.def_level + : def_levels[first_level_index]); + } + } + return Status::OK(); + } + + Status BuildArray(int64_t length_upper_bound, + std::shared_ptr* out) override { + std::shared_ptr child_out; + RETURN_NOT_OK(item_reader_->BuildArray(length_upper_bound * list_size_, &child_out)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr child_data, + ChunksToSingle(*child_out)); + std::shared_ptr child_array = ::arrow::MakeArray(child_data); + if (rows_loaded_ == 0) { + if (child_array->length() % list_size_ != 0) { + return Status::Invalid("VECTOR FixedSizeList child length ", + child_array->length(), + " was not divisible by list_size=", list_size_); + } + rows_loaded_ = child_array->length() / list_size_; + } + + if (child_array->length() != rows_loaded_ * list_size_) { + return Status::Invalid("VECTOR FixedSizeList child length ", child_array->length(), + " did not match expected ", rows_loaded_ * list_size_); + } + + std::shared_ptr validity_buffer; + int64_t null_count = 0; + if (field_->nullable()) { + ARROW_ASSIGN_OR_RAISE( + validity_buffer, + AllocateResizableBuffer(bit_util::BytesForBits(rows_loaded_), ctx_->pool)); + memset(validity_buffer->mutable_data(), 0, + static_cast(bit_util::BytesForBits(rows_loaded_))); + if (collapsed_def_levels_.empty()) { + bit_util::SetBitsTo(validity_buffer->mutable_data(), 0, rows_loaded_, true); + } else { + for (int64_t row = 0; row < rows_loaded_; ++row) { + if (collapsed_def_levels_[row] == level_info_.def_level) { + bit_util::SetBit(validity_buffer->mutable_data(), row); + } else { + ++null_count; + } + } + } + validity_buffer->ZeroPadding(); + } + + auto data = std::make_shared( + field_->type(), rows_loaded_, + std::vector>{null_count > 0 ? validity_buffer : nullptr}, + null_count); + data->child_data.push_back(child_data); + *out = std::make_shared(::arrow::MakeArray(std::move(data))); + return Status::OK(); + } + + const std::shared_ptr field() override { return field_; } + + private: + std::shared_ptr ctx_; + std::shared_ptr field_; + ::parquet::internal::LevelInfo level_info_; + std::unique_ptr item_reader_; + int32_t list_size_; + int64_t rows_loaded_ = 0; + std::vector collapsed_def_levels_; + std::vector collapsed_rep_levels_; +}; + class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl { public: explicit StructReader(std::shared_ptr ctx, @@ -884,8 +1067,12 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& arrow_f } std::unique_ptr input( ctx->iterator_factory(field.column_index, ctx->reader)); - *out = std::make_unique(ctx, arrow_field, std::move(input), - field.level_info); + auto leaf_field = arrow_field; + if (field.is_vector && field.level_info.def_level > 0) { + leaf_field = leaf_field->WithNullable(true); + } + *out = + std::make_unique(ctx, leaf_field, std::move(input), field.level_info); } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP || type_id == ::arrow::Type::FIXED_SIZE_LIST || type_id == ::arrow::Type::LARGE_LIST) { @@ -950,8 +1137,16 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& arrow_f list_field->WithType(::arrow::fixed_size_list(reader_child_type, list_size)); } - *out = std::make_unique(ctx, list_field, field.level_info, - std::move(child_reader)); + if (field.is_vector) { + if (child->is_leaf() && child->column_index >= 0 && list_field->nullable()) { + DCHECK(child_reader->field()->nullable()); + } + *out = std::make_unique( + ctx, list_field, field.level_info, std::move(child_reader)); + } else { + *out = std::make_unique(ctx, list_field, field.level_info, + std::move(child_reader)); + } } else { return Status::UnknownError("Unknown list type: ", field.field->ToString()); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 9c4c462c6b8c..3c0e502a2f57 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -30,6 +30,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" @@ -111,6 +112,130 @@ Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type, return Status::OK(); } +bool IsSupportedVectorStructNode(const Node& node) { + if (node.is_primitive()) { + return true; + } + if (!node.is_group() || node.is_repeated()) { + return false; + } + const auto& group = checked_cast(node); + if (group.logical_type() != nullptr && !group.logical_type()->is_none()) { + return false; + } + for (int i = 0; i < group.field_count(); ++i) { + if (!IsSupportedVectorStructNode(*group.field(i))) { + return false; + } + } + return true; +} + +Status ValidateSupportedVectorStructNode(const Node& node) { + if (IsSupportedVectorStructNode(node)) { + return Status::OK(); + } + return Status::NotImplemented( + "VECTOR elements only support primitive, nested VECTOR, or nested struct fields " + "in the current prototype; repeated/list/map descendants are deferred"); +} + +Status FixedSizeListValueToVectorNode( + const std::shared_ptr<::arrow::Field>& value_field, const std::string& value_name, + int32_t list_size, const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, NodePtr* out) { + if (list_size <= 0) { + return Status::NotImplemented( + "VECTOR repetition does not support zero-length FixedSizeList values"); + } + if (!IsSupportedVectorElementType(*value_field->type())) { + return Status::NotImplemented( + "VECTOR repetition only supports fixed-width primitive, nested FixedSizeList, " + "or struct FixedSizeList elements"); + } + + if (value_field->type()->id() == ::arrow::Type::FIXED_SIZE_LIST) { + const auto& nested_type = checked_cast( + *value_field->type()); + const std::string nested_value_name = + arrow_properties.compliant_nested_types() ? "element" + : nested_type.value_field()->name(); + NodePtr nested_vector; + RETURN_NOT_OK(FixedSizeListValueToVectorNode( + nested_type.value_field(), nested_value_name, nested_type.list_size(), properties, + arrow_properties, &nested_vector)); + if (value_field->nullable()) { + NodePtr optional_element = + GroupNode::Make(value_name, Repetition::OPTIONAL, {nested_vector}, + /*logical_type=*/nullptr, /*field_id=*/-1); + *out = GroupNode::Make("element", Repetition::VECTOR, {optional_element}, + /*logical_type=*/nullptr, /*field_id=*/-1, list_size); + } else { + *out = GroupNode::Make(value_name, Repetition::VECTOR, {nested_vector}, + /*logical_type=*/nullptr, /*field_id=*/-1, list_size); + } + return Status::OK(); + } + + if (value_field->type()->num_fields() == 0) { + NodePtr element; + RETURN_NOT_OK( + FieldToNode(value_name, value_field, properties, arrow_properties, &element)); + if (value_field->nullable()) { + // A node cannot be both VECTOR and OPTIONAL. Nullable primitive elements use + // a VECTOR group with an optional primitive descendant. + *out = GroupNode::Make(value_name, Repetition::VECTOR, {element}, + /*logical_type=*/nullptr, /*field_id=*/-1, list_size); + } else { + const auto& primitive = checked_cast(*element); + *out = PrimitiveNode::Make(value_name, Repetition::VECTOR, + primitive.logical_type(), primitive.physical_type(), + primitive.type_length(), primitive.field_id(), list_size); + } + return Status::OK(); + } + + const auto& struct_type = checked_cast(*value_field->type()); + std::vector children(struct_type.num_fields()); + for (int i = 0; i < struct_type.num_fields(); ++i) { + RETURN_NOT_OK(FieldToNode(struct_type.field(i)->name(), struct_type.field(i), + properties, arrow_properties, &children[i])); + } + if (value_field->nullable()) { + NodePtr struct_element = GroupNode::Make(value_name, Repetition::OPTIONAL, children, + /*logical_type=*/nullptr, /*field_id=*/-1); + *out = GroupNode::Make("element", Repetition::VECTOR, {struct_element}, + /*logical_type=*/nullptr, /*field_id=*/-1, list_size); + } else { + *out = GroupNode::Make(value_name, Repetition::VECTOR, children, + /*logical_type=*/nullptr, /*field_id=*/-1, list_size); + } + return Status::OK(); +} + +// Option B mapping for Arrow FixedSizeList -> Parquet VECTOR. +// +// VECTOR is a structural repetition type. No LogicalType::VECTOR annotation is +// required: vector nullability is carried by the optional parent group, while the +// VECTOR child carries vector_length and contributes no repetition/definition level. +Status FixedSizeListToNode(const std::shared_ptr<::arrow::FixedSizeListType>& type, + const std::string& name, bool nullable, int field_id, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, NodePtr* out) { + const auto& value_field = type->value_field(); + const std::string value_name = + arrow_properties.compliant_nested_types() ? "element" : value_field->name(); + + NodePtr vector; + RETURN_NOT_OK(FixedSizeListValueToVectorNode(value_field, value_name, + type->list_size(), properties, + arrow_properties, &vector)); + RETURN_NOT_OK(ValidateSupportedVectorStructNode(*vector)); + *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {vector}, + /*logical_type=*/nullptr, field_id); + return Status::OK(); +} + Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name, bool nullable, int field_id, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, NodePtr* out) { @@ -453,7 +578,32 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, return StructToNode(struct_type, name, field->nullable(), field_id, properties, arrow_properties, out); } - case ArrowTypeId::FIXED_SIZE_LIST: + case ArrowTypeId::FIXED_SIZE_LIST: { + auto list_type = + std::static_pointer_cast<::arrow::FixedSizeListType>(field->type()); + // Experimental VECTOR encoding is opportunistic: use it for FixedSizeList + // fields supported by this Option B prototype, and preserve writability by + // falling back to the standard LIST encoding for unsupported fields. This + // lets schemas mix VECTOR-friendly dense numeric/struct vectors with + // FixedSizeList values that still need LIST machinery (for example strings). + if (arrow_properties.write_fixed_size_list_as_vector() && + list_type->list_size() > 0 && + IsSupportedVectorElementType(*list_type->value_type())) { + Status vector_status = + FixedSizeListToNode(list_type, name, field->nullable(), field_id, properties, + arrow_properties, out); + if (vector_status.ok()) { + return vector_status; + } + if (!vector_status.IsNotImplemented()) { + return vector_status; + } + } + auto base_list_type = + std::static_pointer_cast<::arrow::BaseListType>(field->type()); + return ListToNode(base_list_type, name, field->nullable(), field_id, properties, + arrow_properties, out); + } case ArrowTypeId::LARGE_LIST: case ArrowTypeId::LIST: { auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type()); @@ -574,6 +724,13 @@ Status PopulateLeaf(int column_index, const std::shared_ptr& field, return Status::OK(); } +void MarkVectorSubtree(SchemaField* field) { + field->is_vector = true; + for (auto& child : field->children) { + MarkVectorSubtree(&child); + } +} + // Special case mentioned in the format spec: // If the name is array or uses the parent's name with `_tuple` appended, // this should be: @@ -599,7 +756,7 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, arrow_fields.push_back(out->children[i].field); } auto struct_type = ::arrow::struct_(arrow_fields); - if (ctx->properties.get_arrow_extensions_enabled() && + if (ctx->properties.get_arrow_extensions_enabled() && node.logical_type() != nullptr && node.logical_type()->is_variant()) { auto extension_type = ::arrow::GetExtensionType("arrow.parquet.variant"); if (extension_type) { @@ -618,6 +775,177 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out); +bool IsVectorGroup(const GroupNode& group) { + return group.field_count() == 1 && group.field(0)->is_vector(); +} + +Status VectorNodeToSchemaField(const Node& vector_node, LevelInfo current_levels, + SchemaTreeContext* ctx, const SchemaField* parent, + SchemaField* out) { + if (!vector_node.is_vector()) { + return Status::Invalid("Expected VECTOR node"); + } + const LevelInfo vector_level = current_levels; + + out->children.resize(1); + SchemaField* child_field = &out->children[0]; + ctx->LinkParent(out, parent); + ctx->LinkParent(child_field, out); + + if (vector_node.is_primitive()) { + const auto& primitive_node = static_cast(vector_node); + int column_index = ctx->schema->GetColumnIndex(primitive_node); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); + auto item_field = ::arrow::field(vector_node.name(), type, /*nullable=*/false, + FieldIdMetadata(vector_node.field_id())); + RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, + child_field)); + } else { + const auto& vector_group = static_cast(vector_node); + if (vector_group.field_count() == 1 && vector_group.field(0)->is_vector()) { + RETURN_NOT_OK(VectorNodeToSchemaField(*vector_group.field(0), current_levels, ctx, + out, child_field)); + } else if (vector_group.field_count() == 1 && + vector_group.field(0)->is_group() && + vector_group.field(0)->is_optional()) { + const auto& optional_group = static_cast(*vector_group.field(0)); + if (optional_group.field_count() == 1 && optional_group.field(0)->is_vector()) { + current_levels.IncrementOptional(); + RETURN_NOT_OK(VectorNodeToSchemaField(*optional_group.field(0), current_levels, + ctx, out, child_field)); + child_field->field = child_field->field->WithNullable(true); + } else { + RETURN_NOT_OK(ValidateSupportedVectorStructNode(optional_group)); + current_levels.IncrementOptional(); + RETURN_NOT_OK(GroupToStruct(optional_group, current_levels, ctx, out, + child_field)); + } + } else if (vector_group.field_count() == 1 && + vector_group.field(0)->is_primitive()) { + const Node& element = *vector_group.field(0); + bool element_nullable = element.is_optional(); + if (element_nullable) { + current_levels.IncrementOptional(); + } + const auto& primitive_node = static_cast(element); + int column_index = ctx->schema->GetColumnIndex(primitive_node); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); + auto item_field = ::arrow::field(element.name(), type, element_nullable, + FieldIdMetadata(element.field_id())); + RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, + child_field)); + } else { + RETURN_NOT_OK(ValidateSupportedVectorStructNode(vector_group)); + RETURN_NOT_OK(GroupToStruct(vector_group, current_levels, ctx, out, child_field)); + } + } + + MarkVectorSubtree(child_field); + out->field = ::arrow::field( + vector_node.name(), + ::arrow::fixed_size_list(child_field->field, vector_node.vector_length()), + /*nullable=*/false, FieldIdMetadata(vector_node.field_id())); + out->level_info = vector_level; + out->is_vector = true; + return Status::OK(); +} + +Status VectorToSchemaField(const GroupNode& group, LevelInfo current_levels, + SchemaTreeContext* ctx, const SchemaField* parent, + SchemaField* out) { + if (group.is_repeated()) { + return Status::NotImplemented( + "VECTOR groups must not be repeated in the current " + "prototype"); + } + if (group.field_count() != 1) { + return Status::Invalid("VECTOR groups must have a single child"); + } + const Node& child = *group.field(0); + if (!child.is_vector()) { + return Status::Invalid("VECTOR groups must contain a VECTOR child"); + } + if (child.is_optional() || child.is_repeated()) { + return Status::Invalid("VECTOR children must not also be OPTIONAL or REPEATED"); + } + + if (group.is_optional()) { + current_levels.IncrementOptional(); + } + const LevelInfo vector_level = current_levels; + + out->children.resize(1); + SchemaField* child_field = &out->children[0]; + ctx->LinkParent(out, parent); + ctx->LinkParent(child_field, out); + + if (child.is_primitive()) { + const auto& primitive_node = static_cast(child); + int column_index = ctx->schema->GetColumnIndex(primitive_node); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); + auto item_field = ::arrow::field(child.name(), type, /*nullable=*/false, + FieldIdMetadata(child.field_id())); + RETURN_NOT_OK( + PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); + } else { + const auto& vector_group = static_cast(child); + if (vector_group.field_count() == 1) { + const Node& element = *vector_group.field(0); + if (element.is_vector()) { + RETURN_NOT_OK(VectorNodeToSchemaField(element, current_levels, ctx, out, + child_field)); + } else if (element.is_repeated()) { + return Status::Invalid("VECTOR element children must not be REPEATED"); + } else if (element.is_primitive()) { + bool element_nullable = element.is_optional(); + if (element_nullable) { + current_levels.IncrementOptional(); + } + const auto& primitive_node = static_cast(element); + int column_index = ctx->schema->GetColumnIndex(primitive_node); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); + auto item_field = ::arrow::field(element.name(), type, element_nullable, + FieldIdMetadata(element.field_id())); + RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, + child_field)); + } else { + const auto& element_group = static_cast(element); + if (element_group.is_optional() && element_group.field_count() == 1 && + element_group.field(0)->is_vector()) { + current_levels.IncrementOptional(); + RETURN_NOT_OK(VectorNodeToSchemaField(*element_group.field(0), current_levels, + ctx, out, child_field)); + child_field->field = child_field->field->WithNullable(true); + } else { + RETURN_NOT_OK(ValidateSupportedVectorStructNode(element_group)); + if (element_group.is_optional()) { + current_levels.IncrementOptional(); + } + RETURN_NOT_OK(GroupToStruct(element_group, current_levels, ctx, out, + child_field)); + } + } + } else { + for (int i = 0; i < vector_group.field_count(); ++i) { + RETURN_NOT_OK(ValidateSupportedVectorStructNode(*vector_group.field(i))); + } + RETURN_NOT_OK(GroupToStruct(vector_group, current_levels, ctx, out, child_field)); + } + } + + MarkVectorSubtree(child_field); + out->field = ::arrow::field( + group.name(), ::arrow::fixed_size_list(child_field->field, child.vector_length()), + group.is_optional(), FieldIdMetadata(group.field_id())); + out->level_info = vector_level; + out->is_vector = true; + return Status::OK(); +} + Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out) { @@ -842,6 +1170,9 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out) { + if (IsVectorGroup(node)) { + return VectorToSchemaField(node, current_levels, ctx, parent, out); + } if (node.logical_type()->is_list()) { return ListToSchemaField(node, current_levels, ctx, parent, out); } else if (node.logical_type()->is_map()) { @@ -903,6 +1234,23 @@ Status NodeToSchemaField(const Node& node, LevelInfo current_levels, int column_index = ctx->schema->GetColumnIndex(primitive_node); ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, GetTypeForNode(column_index, primitive_node, ctx)); + if (node.is_vector()) { + out->children.resize(1); + SchemaField* child_field = &out->children[0]; + ctx->LinkParent(child_field, out); + auto item_field = ::arrow::field(node.name(), type, /*nullable=*/false, + FieldIdMetadata(node.field_id())); + RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, + child_field)); + MarkVectorSubtree(child_field); + out->field = ::arrow::field( + node.name(), ::arrow::fixed_size_list(child_field->field, + primitive_node.vector_length()), + /*nullable=*/false, FieldIdMetadata(node.field_id())); + out->level_info = current_levels; + out->is_vector = true; + return Status::OK(); + } if (node.is_repeated()) { // One-level list encoding, e.g. // a: repeated int32; diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index dd60fde43422..7ab0cc31dd05 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -96,6 +96,11 @@ struct PARQUET_EXPORT SchemaField { parquet::internal::LevelInfo level_info; + // True when this Arrow field is backed by a Parquet VECTOR node. VECTOR may + // be represented either directly as a primitive leaf (non-nullable elements) + // or as an intermediate VECTOR group containing a nullable element leaf. + bool is_vector = false; + bool is_leaf() const { return column_index != -1; } }; diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 2e8cf764b27f..3569d02ac2ec 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -20,6 +20,7 @@ #include "arrow/extension/json.h" #include "arrow/extension/uuid.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" @@ -37,6 +38,18 @@ using ::arrow::Result; using ::arrow::Status; using ::arrow::internal::checked_cast; +bool IsSupportedVectorElementType(const ::arrow::DataType& type) { + if (type.id() == ::arrow::Type::FIXED_SIZE_LIST) { + const auto& list_type = checked_cast(type); + return list_type.list_size() > 0 && + IsSupportedVectorElementType(*list_type.value_type()); + } + return type.id() == ::arrow::Type::STRUCT || + (!::arrow::is_nested(type) && ::arrow::is_fixed_width(type) && + type.id() != ::arrow::Type::DICTIONARY && + type.id() != ::arrow::Type::EXTENSION); +} + namespace { Result> MakeArrowDecimal(const LogicalType& logical_type, diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 09ad891aad3b..48b20338af50 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -40,4 +40,6 @@ Result> GetArrowType( const ArrowReaderProperties& reader_properties, const std::shared_ptr& metadata = nullptr); +bool IsSupportedVectorElementType(const ::arrow::DataType& type); + } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 4b2b06e5e097..1ae22f42b79a 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -20,12 +20,14 @@ #include #include #include +#include #include #include #include #include #include "arrow/array.h" +#include "arrow/array/concatenate.h" #include "arrow/extension_type.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" @@ -107,6 +109,58 @@ bool HasNullableRoot(const SchemaManifest& schema_manifest, return nullable; } +int64_t CountVisitedValues(const std::vector& visited_elements) { + return std::accumulate( + visited_elements.begin(), visited_elements.end(), int64_t{0}, + [](int64_t total, const ElementRange& range) { return total + range.Size(); }); +} + +Result> MaterializeVectorLeafArray( + const MultipathLevelBuilderResult& result, ArrowWriteContext* ctx, + bool* leaf_is_nullable) { + const auto& visited_elements = result.post_list_visited_elements; + DCHECK_GT(visited_elements.size(), 0); + + // Nullable VECTOR rows still need one child slot per vector element so that the + // generic spaced leaf writer can align leaf slots with the VECTOR def levels. Arrow + // FixedSizeList already stores child slots for null parent rows, so preserve a + // zero-copy slice over the complete child range and let WriteArrow's def-level-derived + // validity bitmap suppress values belonging to null vector rows. + if (CountVisitedValues(visited_elements) == result.def_rep_level_count && + visited_elements.size() == 1) { + const ElementRange& range = visited_elements[0]; + auto values = result.leaf_array->Slice(range.start, range.Size()); + if (values->null_count() != 0) { + *leaf_is_nullable = true; + } + return values; + } + + ::arrow::ArrayVector parts; + parts.reserve(visited_elements.size() * 2 + 1); + int64_t position = 0; + for (const ElementRange& range : visited_elements) { + if (range.start > position) { + ARROW_ASSIGN_OR_RAISE( + auto null_values, + ::arrow::MakeArrayOfNull(result.leaf_array->type(), range.start - position, + ctx->memory_pool)); + parts.push_back(std::move(null_values)); + } + parts.push_back(result.leaf_array->Slice(range.start, range.Size())); + position = range.end; + } + if (position < result.def_rep_level_count) { + ARROW_ASSIGN_OR_RAISE(auto null_values, + ::arrow::MakeArrayOfNull(result.leaf_array->type(), + result.def_rep_level_count - position, + ctx->memory_pool)); + parts.push_back(std::move(null_values)); + } + *leaf_is_nullable = true; + return ::arrow::Concatenate(parts, ctx->memory_pool); +} + Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, const ArrowWriterProperties& properties, std::shared_ptr* out) { @@ -134,6 +188,51 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo return Status::OK(); } +Status ValidateVectorColumnProperties(const SchemaDescriptor* schema, + const WriterProperties& properties) { + for (int i = 0; i < schema->num_columns(); ++i) { + const ColumnDescriptor* column = schema->Column(i); + if (!column->in_vector_column()) { + continue; + } + const auto& path = column->path(); + if (properties.dictionary_enabled(path)) { + return Status::Invalid( + "Experimental VECTOR encoding does not support dictionary " + "encoding for column '", + path->ToDotString(), "'"); + } + if (properties.statistics_enabled(path)) { + return Status::Invalid( + "Experimental VECTOR encoding does not support statistics " + "for column '", + path->ToDotString(), "'"); + } + if (properties.page_index_enabled(path)) { + return Status::Invalid( + "Experimental VECTOR encoding does not support page index " + "for column '", + path->ToDotString(), "'"); + } + if (properties.bloom_filter_options(path).has_value()) { + return Status::Invalid( + "Experimental VECTOR encoding does not support bloom filters " + "for column '", + path->ToDotString(), "'"); + } + if (properties.content_defined_chunking_enabled()) { + // CDC is a file-level switch; report it against the first vector column we see + // so the user gets a clear, actionable error rather than a late ParquetException + // from WriteLevels complaining about partial vector batches. + return Status::Invalid( + "Experimental VECTOR encoding does not support content-defined chunking " + "(column '", + path->ToDotString(), "' is VECTOR-encoded)"); + } + } + return Status::OK(); +} + // Manages writing nested parquet columns with support for all nested types // supported by parquet. class ArrowColumnWriterV2 { @@ -169,17 +268,22 @@ class ArrowColumnWriterV2 { leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) { size_t visited_component_size = result.post_list_visited_elements.size(); DCHECK_GT(visited_component_size, 0); - if (visited_component_size != 1) { + std::shared_ptr values_array; + bool leaf_is_nullable = result.leaf_is_nullable; + if (result.leaf_is_vector) { + ARROW_ASSIGN_OR_RAISE(values_array, MaterializeVectorLeafArray( + result, ctx, &leaf_is_nullable)); + } else if (visited_component_size == 1) { + const ElementRange& range = result.post_list_visited_elements[0]; + values_array = result.leaf_array->Slice(range.start, range.Size()); + } else { return Status::NotImplemented( "Lists with non-zero length null components are not supported"); } - const ElementRange& range = result.post_list_visited_elements[0]; - std::shared_ptr values_array = - result.leaf_array->Slice(range.start, range.Size()); return column_writer->WriteArrow(result.def_levels, result.rep_levels, result.def_rep_level_count, *values_array, - ctx, result.leaf_is_nullable); + ctx, leaf_is_nullable); })); } @@ -201,7 +305,7 @@ class ArrowColumnWriterV2 { static ::arrow::Result> Make( const ChunkedArray& data, int64_t offset, const int64_t size, const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer, - int start_leaf_column_index = -1) { + bool write_fixed_size_list_as_vector, int start_leaf_column_index = -1) { int64_t absolute_position = 0; int chunk_index = 0; int64_t chunk_offset = 0; @@ -271,8 +375,10 @@ class ArrowColumnWriterV2 { std::shared_ptr array_to_write = chunk.Slice(chunk_offset, chunk_write_size); if (array_to_write->length() > 0) { - ARROW_ASSIGN_OR_RAISE(std::unique_ptr builder, - MultipathLevelBuilder::Make(*array_to_write, is_nullable)); + ARROW_ASSIGN_OR_RAISE( + std::unique_ptr builder, + MultipathLevelBuilder::Make(*array_to_write, is_nullable, + write_fixed_size_list_as_vector)); if (leaf_count != builder->GetLeafCount()) { return Status::UnknownError("data type leaf_count != builder_leaf_count", leaf_count, " ", builder->GetLeafCount()); @@ -328,6 +434,8 @@ class FileWriterImpl : public FileWriter { } Status Init() { + RETURN_NOT_OK( + ValidateVectorColumnProperties(writer_->schema(), *writer_->properties())); return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr, default_arrow_reader_properties(), &schema_manifest_); } @@ -375,10 +483,10 @@ class FileWriterImpl : public FileWriter { if (row_group_writer_->buffered()) { return Status::Invalid("Cannot write column chunk into the buffered row group."); } - ARROW_ASSIGN_OR_RAISE( - std::unique_ptr writer, - ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_, - row_group_writer_)); + ARROW_ASSIGN_OR_RAISE(std::unique_ptr writer, + ArrowColumnWriterV2::Make( + *data, offset, size, schema_manifest_, row_group_writer_, + arrow_properties_->write_fixed_size_list_as_vector())); return writer->Write(&column_write_context_); } @@ -459,8 +567,10 @@ class FileWriterImpl : public FileWriter { ChunkedArray chunked_array{batch.column(i)}; ARROW_ASSIGN_OR_RAISE( std::unique_ptr writer, - ArrowColumnWriterV2::Make(chunked_array, offset, size, schema_manifest_, - row_group_writer_, column_index_start)); + ArrowColumnWriterV2::Make( + chunked_array, offset, size, schema_manifest_, row_group_writer_, + arrow_properties_->write_fixed_size_list_as_vector(), + column_index_start)); column_index_start += writer->leaf_count(); if (arrow_properties_->use_threads()) { writers.emplace_back(std::move(writer)); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index b3ed46ee2d28..97f2f9a6fb35 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1185,6 +1185,38 @@ inline void DoInBatchesNonRepeated(int64_t num_levels, int64_t batch_size, } } +template +inline void DoInBatchesVectorNonRepeated(int64_t num_levels, int64_t batch_size, + int64_t max_rows_per_page, int32_t vector_length, + Action&& action, + GetBufferedRows&& curr_page_buffered_rows) { + ARROW_DCHECK_GT(vector_length, 0); + + // Most callers pass the physical number of VECTOR leaf slots here. Some writer + // paths operate in logical parent rows, though; support both so VECTOR page + // batching never splits a fixed-size vector and never aborts on a row-count input. + const bool num_levels_is_parent_rows = num_levels % vector_length != 0; + const int64_t total_rows = + num_levels_is_parent_rows ? num_levels : num_levels / vector_length; + int64_t row_offset = 0; + while (row_offset < total_rows) { + int64_t page_buffered_rows = curr_page_buffered_rows(); + ARROW_DCHECK_LE(page_buffered_rows, max_rows_per_page); + + int64_t max_batch_rows = std::max(1, batch_size / vector_length); + max_batch_rows = std::min(max_batch_rows, total_rows - row_offset); + max_batch_rows = std::min(max_batch_rows, max_rows_per_page - page_buffered_rows); + if (max_batch_rows == 0) { + max_batch_rows = 1; + } + int64_t level_offset = row_offset * vector_length; + int64_t level_count = max_batch_rows * vector_length; + + action(level_offset, level_count, /*check_page_limit=*/true); + row_offset += max_batch_rows; + } +} + // DoInBatches for repeated columns template inline void DoInBatchesRepeated(const int16_t* def_levels, const int16_t* rep_levels, @@ -1240,12 +1272,20 @@ inline void DoInBatchesRepeated(const int16_t* def_levels, const int16_t* rep_le template inline void DoInBatches(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, int64_t batch_size, int64_t max_rows_per_page, - bool pages_change_on_record_boundaries, Action&& action, + bool pages_change_on_record_boundaries, bool is_vector, + int32_t vector_length, Action&& action, GetBufferedRows&& curr_page_buffered_rows) { if (!rep_levels) { - DoInBatchesNonRepeated(num_levels, batch_size, max_rows_per_page, - std::forward(action), - std::forward(curr_page_buffered_rows)); + if (is_vector) { + DoInBatchesVectorNonRepeated( + num_levels, batch_size, max_rows_per_page, vector_length, + std::forward(action), + std::forward(curr_page_buffered_rows)); + } else { + DoInBatchesNonRepeated(num_levels, batch_size, max_rows_per_page, + std::forward(action), + std::forward(curr_page_buffered_rows)); + } } else { DoInBatchesRepeated(def_levels, rep_levels, num_levels, batch_size, max_rows_per_page, pages_change_on_record_boundaries, std::forward(action), @@ -1366,7 +1406,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, }; DoInBatches(def_levels, rep_levels, num_values, properties_->write_batch_size(), properties_->max_rows_per_page(), pages_change_on_record_boundaries(), - WriteChunk, [this]() { return num_buffered_rows_; }); + descr_->in_vector_column(), descr_->effective_vector_length(), WriteChunk, + [this]() { return num_buffered_rows_; }); return value_offset; } @@ -1417,16 +1458,20 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, }; DoInBatches(def_levels, rep_levels, num_values, properties_->write_batch_size(), properties_->max_rows_per_page(), pages_change_on_record_boundaries(), - WriteChunk, [this]() { return num_buffered_rows_; }); + descr_->in_vector_column(), descr_->effective_vector_length(), WriteChunk, + [this]() { return num_buffered_rows_; }); } Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& leaf_array, ArrowWriteContext* ctx, bool leaf_field_nullable) override { BEGIN_PARQUET_CATCH_EXCEPTIONS + const bool is_vector = descr_->in_vector_column(); // Leaf nulls are canonical when there is only a single null element after a list - // and it is at the leaf. + // and it is at the leaf. VECTOR parent nulls may also be materialized as spaced + // null slots in the leaf array, so do not treat those as single nullable elements. bool single_nullable_element = + !is_vector && (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) && leaf_field_nullable; if (!leaf_field_nullable && leaf_array.null_count() != 0) { @@ -1700,6 +1745,13 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } WriteRepetitionLevels(num_levels, rep_levels); + } else if (descr_->in_vector_column()) { + const int32_t vector_length = descr_->effective_vector_length(); + if (vector_length <= 0 || num_levels % vector_length != 0) { + throw ParquetException("VECTOR columns must be written in whole-vector batches"); + } + rows_written_ += num_levels / vector_length; + num_buffered_rows_ += num_levels / vector_length; } else { // Each value is exactly one row rows_written_ += num_levels; @@ -1794,6 +1846,13 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } } WriteRepetitionLevels(num_levels, rep_levels); + } else if (descr_->in_vector_column()) { + const int32_t vector_length = descr_->effective_vector_length(); + if (vector_length <= 0 || num_levels % vector_length != 0) { + throw ParquetException("VECTOR columns must be written in whole-vector batches"); + } + rows_written_ += num_levels / vector_length; + num_buffered_rows_ += num_levels / vector_length; } else { // Each value is exactly one row rows_written_ += num_levels; @@ -2079,6 +2138,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( PARQUET_CATCH_NOT_OK( DoInBatches(def_levels, rep_levels, num_levels, properties_->write_batch_size(), properties_->max_rows_per_page(), pages_change_on_record_boundaries(), + descr_->in_vector_column(), descr_->effective_vector_length(), WriteIndicesChunk, [this]() { return num_buffered_rows_; })); return Status::OK(); } @@ -2541,6 +2601,7 @@ Status TypedColumnWriterImpl::WriteArrowDense( PARQUET_CATCH_NOT_OK( DoInBatches(def_levels, rep_levels, num_levels, properties_->write_batch_size(), properties_->max_rows_per_page(), pages_change_on_record_boundaries(), + descr_->in_vector_column(), descr_->effective_vector_length(), WriteChunk, [this]() { return num_buffered_rows_; })); return Status::OK(); } diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index e3cc5adb9648..633d55d649a5 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -190,6 +190,20 @@ enum FieldRepetitionType { /** The field is repeated and can contain 0 or more values */ REPEATED = 2; + + /** + * Experimental Option B prototype for fixed-size vector/list values. + * + * This field repeats vector_length times per parent value without increasing the + * maximum definition or repetition level of its descendants. VECTOR-aware + * readers use the fixed multiplicity from the schema to group the physical + * child values back into one logical vector value. Vector-level nullability is + * represented by an OPTIONAL parent node. + * + * Readers that do not understand VECTOR are expected to reject the file rather + * than attempting a LIST fallback. + */ + VECTOR = 3; } /** @@ -557,6 +571,13 @@ struct SchemaElement { * for some logical types to ensure forward-compatibility in format v1. */ 10: optional LogicalType logicalType + + /** + * Required when repetition_type is VECTOR. Must be positive. For nested VECTOR + * nodes, the effective number of physical leaf values per parent is the product + * of vector_length along the leaf's VECTOR ancestors. + */ + 12: optional i32 vector_length; } /** diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 6634bac4f684..cf6813e06438 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1298,6 +1298,7 @@ class PARQUET_EXPORT ArrowWriterProperties { truncated_timestamps_allowed_(false), store_schema_(false), compliant_nested_types_(true), + write_fixed_size_list_as_vector_(false), engine_version_(V2), use_threads_(kArrowDefaultUseThreads), executor_(NULLPTR), @@ -1367,6 +1368,12 @@ class PARQUET_EXPORT ArrowWriterProperties { return this; } + /// \brief EXPERIMENTAL: encode Arrow FixedSizeList as Parquet VECTOR. + Builder* enable_experimental_vector_encoding() { + write_fixed_size_list_as_vector_ = true; + return this; + } + /// Set the version of the Parquet writer engine. Builder* set_engine_version(EngineVersion version) { engine_version_ = version; @@ -1409,7 +1416,8 @@ class PARQUET_EXPORT ArrowWriterProperties { return std::shared_ptr(new ArrowWriterProperties( write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, - engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_)); + write_fixed_size_list_as_vector_, engine_version_, use_threads_, executor_, + write_time_adjusted_to_utc_)); } private: @@ -1421,6 +1429,7 @@ class PARQUET_EXPORT ArrowWriterProperties { bool store_schema_; bool compliant_nested_types_; + bool write_fixed_size_list_as_vector_; EngineVersion engine_version_; bool use_threads_; @@ -1447,6 +1456,10 @@ class PARQUET_EXPORT ArrowWriterProperties { /// "element". bool compliant_nested_types() const { return compliant_nested_types_; } + bool write_fixed_size_list_as_vector() const { + return write_fixed_size_list_as_vector_; + } + /// \brief The underlying engine version to use when writing Arrow data. /// /// V2 is currently the latest V1 is considered deprecated but left in @@ -1471,6 +1484,7 @@ class PARQUET_EXPORT ArrowWriterProperties { ::arrow::TimeUnit::type coerce_timestamps_unit, bool truncated_timestamps_allowed, bool store_schema, bool compliant_nested_types, + bool write_fixed_size_list_as_vector, EngineVersion engine_version, bool use_threads, ::arrow::internal::Executor* executor, bool write_time_adjusted_to_utc) @@ -1480,6 +1494,7 @@ class PARQUET_EXPORT ArrowWriterProperties { truncated_timestamps_allowed_(truncated_timestamps_allowed), store_schema_(store_schema), compliant_nested_types_(compliant_nested_types), + write_fixed_size_list_as_vector_(write_fixed_size_list_as_vector), engine_version_(engine_version), use_threads_(use_threads), executor_(executor), @@ -1491,6 +1506,7 @@ class PARQUET_EXPORT ArrowWriterProperties { const bool truncated_timestamps_allowed_; const bool store_schema_; const bool compliant_nested_types_; + const bool write_fixed_size_list_as_vector_; const EngineVersion engine_version_; const bool use_threads_; ::arrow::internal::Executor* executor_; diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 0cfa49c21c16..04f829e6e50e 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -50,6 +50,16 @@ void CheckColumnBounds(int column_index, size_t max_columns) { } } +void ValidateVectorProperties(Repetition::type repetition, int32_t vector_length) { + if (repetition == Repetition::VECTOR) { + if (vector_length <= 0) { + throw ParquetException("VECTOR nodes must specify a positive vector_length"); + } + } else if (vector_length != -1) { + throw ParquetException("Only VECTOR nodes may specify vector_length"); + } +} + } // namespace namespace schema { @@ -117,7 +127,7 @@ const std::shared_ptr Node::path() const { bool Node::EqualsInternal(const Node* other) const { return type_ == other->type_ && name_ == other->name_ && repetition_ == other->repetition_ && converted_type_ == other->converted_type_ && - field_id_ == other->field_id() && + field_id_ == other->field_id() && vector_length_ == other->vector_length() && logical_type_->Equals(*(other->logical_type())); } @@ -128,10 +138,12 @@ void Node::SetParent(const Node* parent) { parent_ = parent; } PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, ConvertedType::type converted_type, - int length, int precision, int scale, int id) - : Node(Node::PRIMITIVE, name, repetition, converted_type, id), + int length, int precision, int scale, int id, + int32_t vector_length) + : Node(Node::PRIMITIVE, name, repetition, converted_type, id, vector_length), physical_type_(type), type_length_(length) { + ValidateVectorProperties(repetition, vector_length); std::stringstream ss; // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being @@ -241,10 +253,12 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition, std::shared_ptr logical_type, - Type::type physical_type, int physical_length, int id) - : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id), + Type::type physical_type, int physical_length, int id, + int32_t vector_length) + : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id, vector_length), physical_type_(physical_type), type_length_(physical_length) { + ValidateVectorProperties(repetition, vector_length); std::stringstream error; if (logical_type_) { // Check for logical type <=> node type consistency @@ -315,8 +329,11 @@ void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const { // Group node GroupNode::GroupNode(const std::string& name, Repetition::type repetition, - const NodeVector& fields, ConvertedType::type converted_type, int id) - : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) { + const NodeVector& fields, ConvertedType::type converted_type, int id, + int32_t vector_length) + : Node(Node::GROUP, name, repetition, converted_type, id, vector_length), + fields_(fields) { + ValidateVectorProperties(repetition, vector_length); // For forward compatibility, create an equivalent logical type logical_type_ = LogicalType::FromConvertedType(converted_type_); if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) && @@ -334,8 +351,11 @@ GroupNode::GroupNode(const std::string& name, Repetition::type repetition, GroupNode::GroupNode(const std::string& name, Repetition::type repetition, const NodeVector& fields, - std::shared_ptr logical_type, int id) - : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) { + std::shared_ptr logical_type, int id, + int32_t vector_length) + : Node(Node::GROUP, name, repetition, std::move(logical_type), id, vector_length), + fields_(fields) { + ValidateVectorProperties(repetition, vector_length); if (logical_type_) { // Check for logical type <=> node type consistency if (logical_type_->is_nested()) { @@ -423,19 +443,23 @@ std::unique_ptr GroupNode::FromParquet(const void* opaque_element, if (element->__isset.field_id) { field_id = element->field_id; } + int32_t vector_length = -1; + if (element->__isset.vector_length) { + vector_length = element->vector_length; + } std::unique_ptr group_node; if (element->__isset.logicalType) { // updated writer with logical type present - group_node = std::unique_ptr( - new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields, - LogicalType::FromThrift(element->logicalType), field_id)); + group_node = std::unique_ptr(new GroupNode( + element->name, LoadEnumSafe(&element->repetition_type), fields, + LogicalType::FromThrift(element->logicalType), field_id, vector_length)); } else { group_node = std::unique_ptr(new GroupNode( element->name, LoadEnumSafe(&element->repetition_type), fields, (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type) : ConvertedType::NONE), - field_id)); + field_id, vector_length)); } return std::unique_ptr(group_node.release()); @@ -449,25 +473,30 @@ std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element) { if (element->__isset.field_id) { field_id = element->field_id; } + int32_t vector_length = -1; + if (element->__isset.vector_length) { + vector_length = element->vector_length; + } std::unique_ptr primitive_node; if (element->__isset.logicalType) { // updated writer with logical type present - primitive_node = std::unique_ptr( - new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type), - LogicalType::FromThrift(element->logicalType), - LoadEnumSafe(&element->type), element->type_length, field_id)); - } else if (element->__isset.converted_type) { - // legacy writer with converted type present primitive_node = std::unique_ptr(new PrimitiveNode( element->name, LoadEnumSafe(&element->repetition_type), - LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type), - element->type_length, element->precision, element->scale, field_id)); + LogicalType::FromThrift(element->logicalType), LoadEnumSafe(&element->type), + element->type_length, field_id, vector_length)); + } else if (element->__isset.converted_type) { + // legacy writer with converted type present + primitive_node = std::unique_ptr( + new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type), + LoadEnumSafe(&element->type), + LoadEnumSafe(&element->converted_type), element->type_length, + element->precision, element->scale, field_id, vector_length)); } else { // logical type not present primitive_node = std::unique_ptr(new PrimitiveNode( element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(), - LoadEnumSafe(&element->type), element->type_length, field_id)); + LoadEnumSafe(&element->type), element->type_length, field_id, vector_length)); } // Return as unique_ptr to the base type @@ -499,6 +528,9 @@ void GroupNode::ToParquet(void* opaque_element) const { if (field_id_ >= 0) { element->__set_field_id(field_id_); } + if (is_vector()) { + element->__set_vector_length(vector_length_); + } if (logical_type_ && logical_type_->is_serialized()) { element->__set_logicalType(logical_type_->ToThrift()); } @@ -524,6 +556,9 @@ void PrimitiveNode::ToParquet(void* opaque_element) const { if (field_id_ >= 0) { element->__set_field_id(field_id_); } + if (is_vector()) { + element->__set_vector_length(vector_length_); + } if (logical_type_ && logical_type_->is_serialized() && // TODO(tpboudreau): remove the following conjunct to enable serialization // of IntervalTypes after parquet.thrift recognizes them @@ -634,6 +669,9 @@ static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) { case Repetition::REPEATED: stream << "repeated"; break; + case Repetition::VECTOR: + stream << "vector"; + break; default: break; } @@ -710,14 +748,19 @@ struct SchemaPrinter : public Node::ConstVisitor { stream_ << " "; PrintType(node, stream_); stream_ << " field_id=" << node->field_id() << " " << node->name(); + if (node->is_vector()) { + stream_ << " [" << node->vector_length() << "]"; + } PrintConvertedType(node, stream_); stream_ << ";" << std::endl; } void Visit(const GroupNode* node) { PrintRepLevel(node->repetition(), stream_); - stream_ << " group " - << "field_id=" << node->field_id() << " " << node->name(); + stream_ << " group " << "field_id=" << node->field_id() << " " << node->name(); + if (node->is_vector()) { + stream_ << " [" << node->vector_length() << "]"; + } auto lt = node->converted_type(); const auto& la = node->logical_type(); if (la && la->is_valid() && !la->is_none()) { @@ -836,6 +879,9 @@ void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, // between an empty list and a list with an item in it. ++max_rep_level; ++max_def_level; + } else if (node->is_vector()) { + // VECTOR fields repeat a fixed number of times per parent value without + // increasing the maximum definition or repetition level. } // Now, walk the schema and create a ColumnDescriptor for each leaf node @@ -848,8 +894,23 @@ void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, node_to_leaf_index_[static_cast(node.get())] = static_cast(leaves_.size()); + // Determine the product of all VECTOR-repeated ancestors (including this leaf) + // so nested VECTOR shapes such as vector<3, vector<4, int32>> report the total + // number of physical leaf values contributed per parent record. + int32_t effective_vector_length = -1; + for (const Node* cursor = node.get(); cursor != nullptr; cursor = cursor->parent()) { + if (cursor->is_vector()) { + if (effective_vector_length < 0) { + effective_vector_length = cursor->vector_length(); + } else { + effective_vector_length *= cursor->vector_length(); + } + } + } + // Primitive node, append to leaves - leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this)); + leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this, + effective_vector_length)); leaf_to_base_.emplace(static_cast(leaves_.size()) - 1, base); leaf_to_idx_.emplace(node->path()->ToDotString(), static_cast(leaves_.size()) - 1); @@ -866,10 +927,12 @@ int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const { ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level, int16_t max_repetition_level, - const SchemaDescriptor* schema_descr) + const SchemaDescriptor* schema_descr, + int32_t effective_vector_length) : node_(std::move(node)), max_definition_level_(max_definition_level), - max_repetition_level_(max_repetition_level) { + max_repetition_level_(max_repetition_level), + effective_vector_length_(effective_vector_length) { if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); } @@ -879,7 +942,8 @@ ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_ bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { return primitive_node_->Equals(other.primitive_node_) && max_repetition_level() == other.max_repetition_level() && - max_definition_level() == other.max_definition_level(); + max_definition_level() == other.max_definition_level() && + effective_vector_length() == other.effective_vector_length(); } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { @@ -930,8 +994,13 @@ std::string ColumnDescriptor::ToString() const { << " physical_type: " << TypeToString(physical_type()) << "," << std::endl << " converted_type: " << ConvertedTypeToString(converted_type()) << "," << std::endl - << " logical_type: " << logical_type()->ToString() << "," << std::endl - << " max_definition_level: " << max_definition_level() << "," << std::endl + << " logical_type: " << logical_type()->ToString() << "," << std::endl; + + if (schema_node()->is_vector()) { + ss << " vector_length: " << schema_node()->vector_length() << "," << std::endl; + } + + ss << " max_definition_level: " << max_definition_level() << "," << std::endl << " max_repetition_level: " << max_repetition_level() << "," << std::endl; if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 1addc73bd367..3b21cb6a40dd 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -109,6 +109,11 @@ class PARQUET_EXPORT Node { bool is_repeated() const { return repetition_ == Repetition::REPEATED; } + /// \brief True when this node specifically has VECTOR repetition; use + /// ColumnDescriptor::in_vector_column() to ask whether a column is part of + /// a VECTOR subtree. + bool is_vector() const { return repetition_ == Repetition::VECTOR; } + bool is_required() const { return repetition_ == Repetition::REQUIRED; } virtual bool Equals(const Node* other) const = 0; @@ -128,6 +133,10 @@ class PARQUET_EXPORT Node { /// Thrift. int field_id() const { return field_id_; } + /// \brief The fixed number of values per parent when repetition == VECTOR. + /// Returns -1 when this node is not VECTOR-repeated. + int32_t vector_length() const { return vector_length_; } + const Node* parent() const { return parent_; } const std::shared_ptr path() const; @@ -155,21 +164,25 @@ class PARQUET_EXPORT Node { friend class GroupNode; Node(Node::type type, const std::string& name, Repetition::type repetition, - ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1) + ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1, + int32_t vector_length = -1) : type_(type), name_(name), repetition_(repetition), converted_type_(converted_type), field_id_(field_id), + vector_length_(vector_length), parent_(NULLPTR) {} Node(Node::type type, const std::string& name, Repetition::type repetition, - std::shared_ptr logical_type, int field_id = -1) + std::shared_ptr logical_type, int field_id = -1, + int32_t vector_length = -1) : type_(type), name_(name), repetition_(repetition), logical_type_(std::move(logical_type)), field_id_(field_id), + vector_length_(vector_length), parent_(NULLPTR) {} Node::type type_; @@ -178,6 +191,7 @@ class PARQUET_EXPORT Node { ConvertedType::type converted_type_{ConvertedType::NONE}; std::shared_ptr logical_type_; int field_id_; + int32_t vector_length_; // Nodes should not be shared, they have a single parent. const Node* parent_; @@ -205,9 +219,9 @@ class PARQUET_EXPORT PrimitiveNode : public Node { Type::type type, ConvertedType::type converted_type = ConvertedType::NONE, int length = -1, int precision = -1, int scale = -1, - int field_id = -1) { + int field_id = -1, int32_t vector_length = -1) { return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length, - precision, scale, field_id)); + precision, scale, field_id, vector_length)); } // If no logical type, pass LogicalType::None() or nullptr @@ -215,9 +229,10 @@ class PARQUET_EXPORT PrimitiveNode : public Node { static inline NodePtr Make(const std::string& name, Repetition::type repetition, std::shared_ptr logical_type, Type::type primitive_type, int primitive_length = -1, - int field_id = -1) { + int field_id = -1, int32_t vector_length = -1) { return NodePtr(new PrimitiveNode(name, repetition, std::move(logical_type), - primitive_type, primitive_length, field_id)); + primitive_type, primitive_length, field_id, + vector_length)); } bool Equals(const Node* other) const override; @@ -239,11 +254,13 @@ class PARQUET_EXPORT PrimitiveNode : public Node { private: PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, ConvertedType::type converted_type = ConvertedType::NONE, int length = -1, - int precision = -1, int scale = -1, int field_id = -1); + int precision = -1, int scale = -1, int field_id = -1, + int32_t vector_length = -1); PrimitiveNode(const std::string& name, Repetition::type repetition, std::shared_ptr logical_type, - Type::type primitive_type, int primitive_length = -1, int field_id = -1); + Type::type primitive_type, int primitive_length = -1, int field_id = -1, + int32_t vector_length = -1); Type::type physical_type_; int32_t type_length_; @@ -270,8 +287,9 @@ class PARQUET_EXPORT GroupNode : public Node { static inline NodePtr Make(const std::string& name, Repetition::type repetition, const NodeVector& fields, ConvertedType::type converted_type = ConvertedType::NONE, - int field_id = -1) { - return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id)); + int field_id = -1, int32_t vector_length = -1) { + return NodePtr( + new GroupNode(name, repetition, fields, converted_type, field_id, vector_length)); } // If no logical type, pass nullptr @@ -279,9 +297,9 @@ class PARQUET_EXPORT GroupNode : public Node { static inline NodePtr Make(const std::string& name, Repetition::type repetition, const NodeVector& fields, std::shared_ptr logical_type, - int field_id = -1) { - return NodePtr( - new GroupNode(name, repetition, fields, std::move(logical_type), field_id)); + int field_id = -1, int32_t vector_length = -1) { + return NodePtr(new GroupNode(name, repetition, fields, std::move(logical_type), + field_id, vector_length)); } bool Equals(const Node* other) const override; @@ -307,11 +325,12 @@ class PARQUET_EXPORT GroupNode : public Node { private: GroupNode(const std::string& name, Repetition::type repetition, const NodeVector& fields, - ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1); + ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1, + int32_t vector_length = -1); GroupNode(const std::string& name, Repetition::type repetition, const NodeVector& fields, std::shared_ptr logical_type, - int field_id = -1); + int field_id = -1, int32_t vector_length = -1); NodeVector fields_; bool EqualsInternal(const GroupNode* other) const; @@ -358,7 +377,8 @@ class PARQUET_EXPORT ColumnDescriptor { public: ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level, int16_t max_repetition_level, - const SchemaDescriptor* schema_descr = NULLPTR); + const SchemaDescriptor* schema_descr = NULLPTR, + int32_t effective_vector_length = -1); bool Equals(const ColumnDescriptor& other) const; @@ -366,6 +386,16 @@ class PARQUET_EXPORT ColumnDescriptor { int16_t max_repetition_level() const { return max_repetition_level_; } + /// \brief The fixed number of leaf values per parent record contributed by + /// this column, computed as the product of all VECTOR-repeated ancestors + /// (including the leaf itself if it is VECTOR-repeated). Returns -1 when the + /// column is not part of a VECTOR-repeated subtree. + int32_t effective_vector_length() const { return effective_vector_length_; } + + /// \brief True when this column belongs to a VECTOR-repeated subtree + /// (the leaf or some ancestor has Repetition::VECTOR). + bool in_vector_column() const { return effective_vector_length_ > 0; } + Type::type physical_type() const { return primitive_node_->physical_type(); } ConvertedType::type converted_type() const { return primitive_node_->converted_type(); } @@ -402,6 +432,7 @@ class PARQUET_EXPORT ColumnDescriptor { int16_t max_definition_level_; int16_t max_repetition_level_; + int32_t effective_vector_length_; }; // Container for the converted Parquet schema with a computed information from diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 2950a7df70f8..62433f9c3cc3 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -43,7 +43,8 @@ namespace schema { static inline SchemaElement NewPrimitive(const std::string& name, FieldRepetitionType::type repetition, - Type::type type, int field_id = -1) { + Type::type type, int field_id = -1, + int32_t vector_length = -1) { SchemaElement result; result.__set_name(name); result.__set_repetition_type(repetition); @@ -51,12 +52,16 @@ static inline SchemaElement NewPrimitive(const std::string& name, if (field_id >= 0) { result.__set_field_id(field_id); } + if (vector_length >= 0) { + result.__set_vector_length(vector_length); + } return result; } static inline SchemaElement NewGroup(const std::string& name, FieldRepetitionType::type repetition, - int num_children, int field_id = -1) { + int num_children, int field_id = -1, + int32_t vector_length = -1) { SchemaElement result; result.__set_name(name); result.__set_repetition_type(repetition); @@ -65,6 +70,9 @@ static inline SchemaElement NewGroup(const std::string& name, if (field_id >= 0) { result.__set_field_id(field_id); } + if (vector_length >= 0) { + result.__set_vector_length(vector_length); + } return result; } @@ -217,6 +225,30 @@ TEST_F(TestPrimitiveNode, FromParquet) { ASSERT_EQ(12, prim_node_->decimal_metadata().precision); } +TEST_F(TestPrimitiveNode, VectorFromParquet) { + SchemaElement elt = + NewPrimitive(name_, FieldRepetitionType::VECTOR, Type::FLOAT, field_id_, 8); + + ASSERT_NO_FATAL_FAILURE(Convert(&elt)); + ASSERT_EQ(name_, prim_node_->name()); + ASSERT_EQ(field_id_, prim_node_->field_id()); + ASSERT_EQ(Repetition::VECTOR, prim_node_->repetition()); + ASSERT_TRUE(prim_node_->is_vector()); + ASSERT_EQ(8, prim_node_->vector_length()); + ASSERT_EQ(Type::FLOAT, prim_node_->physical_type()); +} + +TEST_F(TestPrimitiveNode, VectorValidation) { + ASSERT_THROW(PrimitiveNode::Make("vec", Repetition::VECTOR, Type::FLOAT), + ParquetException); + ASSERT_THROW(PrimitiveNode::Make("scalar", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 4), + ParquetException); + ASSERT_THROW(PrimitiveNode::Make("empty", Repetition::VECTOR, Type::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 0), + ParquetException); +} + TEST_F(TestPrimitiveNode, Equals) { PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32); PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64); @@ -365,6 +397,16 @@ TEST_F(TestGroupNode, Attrs) { ASSERT_EQ(ConvertedType::LIST, node2.converted_type()); } +TEST_F(TestGroupNode, VectorAttrs) { + auto node = GroupNode::Make("vec", Repetition::VECTOR, Fields1(), ConvertedType::NONE, + /*field_id=*/-1, + /*vector_length=*/4); + + ASSERT_TRUE(node->is_vector()); + ASSERT_EQ(Repetition::VECTOR, node->repetition()); + ASSERT_EQ(4, node->vector_length()); +} + TEST_F(TestGroupNode, Equals) { NodeVector f1 = Fields1(); NodeVector f2 = Fields1(); @@ -799,6 +841,39 @@ TEST_F(TestSchemaDescriptor, BuildTree) { ASSERT_EQ(nleaves, descr_.num_columns()); } +TEST_F(TestSchemaDescriptor, BuildTreeVector) { + NodePtr element = PrimitiveNode::Make("element", Repetition::VECTOR, Type::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 3); + NodePtr embedding = GroupNode::Make("embedding", Repetition::OPTIONAL, {element}); + + descr_.Init(GroupNode::Make("schema", Repetition::REPEATED, {embedding})); + + ASSERT_EQ(1, descr_.num_columns()); + const ColumnDescriptor* col = descr_.Column(0); + EXPECT_EQ(1, col->max_definition_level()); + EXPECT_EQ(0, col->max_repetition_level()); + EXPECT_TRUE(col->schema_node()->is_vector()); + EXPECT_EQ(3, col->schema_node()->vector_length()); + EXPECT_EQ(3, col->effective_vector_length()); +} + +TEST_F(TestSchemaDescriptor, BuildTreeNestedVector) { + NodePtr element = PrimitiveNode::Make("element", Repetition::VECTOR, Type::FLOAT, + ConvertedType::NONE, -1, -1, -1, -1, 4); + NodePtr inner = GroupNode::Make("inner", Repetition::VECTOR, {element}, + /*converted_type=*/ConvertedType::NONE, + /*field_id=*/-1, /*vector_length=*/3); + NodePtr embedding = GroupNode::Make("embedding", Repetition::OPTIONAL, {inner}); + + descr_.Init(GroupNode::Make("schema", Repetition::REPEATED, {embedding})); + + ASSERT_EQ(1, descr_.num_columns()); + const ColumnDescriptor* col = descr_.Column(0); + EXPECT_EQ(1, col->max_definition_level()); + EXPECT_EQ(0, col->max_repetition_level()); + EXPECT_EQ(12, col->effective_vector_length()); +} + TEST_F(TestSchemaDescriptor, HasRepeatedFields) { NodeVector fields; NodePtr schema; @@ -886,6 +961,28 @@ TEST(TestSchemaPrinter, Examples) { ASSERT_EQ(expected, result); } +TEST(TestSchemaPrinter, Vector) { + NodePtr schema = GroupNode::Make( + "schema", Repetition::REPEATED, + {GroupNode::Make( + "embedding", Repetition::OPTIONAL, + {GroupNode::Make( + "vector", Repetition::VECTOR, + {PrimitiveNode::Make("element", Repetition::REQUIRED, Type::FLOAT)}, + /*logical_type=*/nullptr, -1, 3)}, + /*logical_type=*/nullptr)}); + + std::string expected = R"(repeated group field_id=-1 schema { + optional group field_id=-1 embedding { + vector group field_id=-1 vector [3] { + required float field_id=-1 element; + } + } +} +)"; + ASSERT_EQ(expected, Print(schema)); +} + static void ConfirmFactoryEquivalence( ConvertedType::type converted_type, const std::shared_ptr& from_make, diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..d4dd9af0c070 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -758,6 +758,7 @@ class LogicalType::Impl { class Geometry; class Geography; class Variant; + class Vector; class No; class Undefined; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index ad4df5119e75..3d639383e4ee 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -112,7 +112,13 @@ class LogicalType; // Mirrors parquet::FieldRepetitionType struct Repetition { - enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 }; + enum type { + REQUIRED = 0, + OPTIONAL = 1, + REPEATED = 2, + VECTOR = 3, + /*Always last*/ UNDEFINED = 4 + }; }; // Reference: @@ -231,6 +237,7 @@ class PARQUET_EXPORT LogicalType { static std::shared_ptr Variant( int8_t specVersion = kVariantSpecVersion); + static std::shared_ptr Geometry(std::string crs = ""); static std::shared_ptr Geography( From d0f7a1f8bbbf0ff4f1d9e373006295920bb54355 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 May 2026 16:23:10 +0200 Subject: [PATCH 2/5] work --- .../parquet/arrow/arrow_reader_writer_test.cc | 204 +++++++++++++++++- cpp/src/parquet/arrow/writer.cc | 42 +--- cpp/src/parquet/column_writer.cc | 25 +++ 3 files changed, 220 insertions(+), 51 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 62044274ad63..40595938be7f 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -67,6 +67,8 @@ #include "parquet/api/reader.h" #include "parquet/api/writer.h" +#include "parquet/bloom_filter.h" +#include "parquet/bloom_filter_reader.h" #include "parquet/arrow/fuzz_internal.h" #include "parquet/arrow/reader.h" @@ -76,6 +78,7 @@ #include "parquet/arrow/writer.h" #include "parquet/column_writer.h" #include "parquet/file_writer.h" +#include "parquet/page_index.h" #include "parquet/properties.h" #include "parquet/test_util.h" #include "parquet/types.h" @@ -426,18 +429,13 @@ void WriteTableToBuffer(const std::shared_ptr
& table, int64_t row_group_s std::shared_ptr VectorWriterProperties() { auto builder = WriterProperties::Builder(); - return builder.disable_dictionary() - ->disable_statistics() - ->disable_write_page_index() - ->encoding(Encoding::PLAIN) - ->build(); + return builder.enable_write_page_index()->encoding(Encoding::PLAIN)->build(); } std::shared_ptr VectorByteStreamSplitWriterProperties() { auto builder = WriterProperties::Builder(); return builder.disable_dictionary() - ->disable_statistics() - ->disable_write_page_index() + ->enable_write_page_index() ->encoding(Encoding::BYTE_STREAM_SPLIT) ->build(); } @@ -4064,7 +4062,7 @@ TEST(ArrowReadWrite, FixedSizeListVectorNullableNestedStructElementsRoundTrip) { ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } -TEST(ArrowWriteOnly, FixedSizeListVectorRejectsDefaultWriterProperties) { +TEST(ArrowWriteOnly, FixedSizeListVectorDefaultWriterProperties) { auto table = MakeVectorFixedSizeListTable(R"([ [1, 2, 3], [4, 5, 6]])", @@ -4072,9 +4070,193 @@ TEST(ArrowWriteOnly, FixedSizeListVectorRejectsDefaultWriterProperties) { ArrowWriterProperties::Builder builder; builder.enable_experimental_vector_encoding(); - ASSERT_RAISES(Invalid, - WriteTableToBuffer(table, /*row_group_size=*/2, - default_writer_properties(), builder.build())); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/2, + default_writer_properties(), builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + const auto encodings = + parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0)->encodings(); + EXPECT_NE(std::find(encodings.begin(), encodings.end(), Encoding::RLE_DICTIONARY), + encodings.end()); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorElementStatistics) { + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ + [1, 2, 3], + [4, 5, 6]])", + /*nullable=*/false); + + auto writer_properties = WriterProperties::Builder() + .disable_dictionary() + ->encoding(Encoding::PLAIN) + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, + writer_properties, + arrow_builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + auto column = parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0); + ASSERT_TRUE(column->is_stats_set()); + auto statistics = column->statistics(); + ASSERT_EQ(6, statistics->num_values()); + ASSERT_EQ(0, statistics->null_count()); + + std::shared_ptr min, max; + ASSERT_OK(StatisticsAsScalars(*statistics, &min, &max)); + ASSERT_OK_AND_ASSIGN(auto expected_min, ::arrow::MakeScalar(::arrow::int32(), 1)); + ASSERT_OK_AND_ASSIGN(auto expected_max, ::arrow::MakeScalar(::arrow::int32(), 6)); + ::arrow::AssertScalarsEqual(*expected_min, *min, /*verbose=*/true); + ::arrow::AssertScalarsEqual(*expected_max, *max, /*verbose=*/true); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorNullableElementStatistics) { + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ + [1, 2, 3], + null, + [4, null, 6]])", + /*nullable=*/true, + /*element_nullable=*/true); + + auto writer_properties = WriterProperties::Builder() + .disable_dictionary() + ->encoding(Encoding::PLAIN) + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/3, + writer_properties, + arrow_builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + auto column = parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0); + ASSERT_TRUE(column->is_stats_set()); + auto statistics = column->statistics(); + ASSERT_EQ(5, statistics->num_values()); + ASSERT_EQ(4, statistics->null_count()); + + std::shared_ptr min, max; + ASSERT_OK(StatisticsAsScalars(*statistics, &min, &max)); + ASSERT_OK_AND_ASSIGN(auto expected_min, ::arrow::MakeScalar(::arrow::int32(), 1)); + ASSERT_OK_AND_ASSIGN(auto expected_max, ::arrow::MakeScalar(::arrow::int32(), 6)); + ::arrow::AssertScalarsEqual(*expected_min, *min, /*verbose=*/true); + ::arrow::AssertScalarsEqual(*expected_max, *max, /*verbose=*/true); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorPageIndex) { + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9]])", + /*nullable=*/false); + + auto writer_properties = WriterProperties::Builder() + .disable_dictionary() + ->encoding(Encoding::PLAIN) + ->enable_write_page_index() + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, + writer_properties, + arrow_builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + auto page_index_reader = parquet_reader->GetPageIndexReader(); + ASSERT_NE(nullptr, page_index_reader); + auto row_group_index_reader = page_index_reader->RowGroup(0); + ASSERT_NE(nullptr, row_group_index_reader); + ASSERT_NE(nullptr, row_group_index_reader->GetColumnIndex(0)); + ASSERT_NE(nullptr, row_group_index_reader->GetOffsetIndex(0)); +} + +TEST(ArrowWriteOnly, FixedSizeListVectorBloomFilter) { + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ + [1, 2, 3], + [4, 5, 6]])", + /*nullable=*/false); + + BloomFilterOptions bloom_filter_options{10, 0.05}; + auto writer_properties = WriterProperties::Builder() + .disable_dictionary() + ->encoding(Encoding::PLAIN) + ->enable_bloom_filter("root.element", bloom_filter_options) + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, + writer_properties, + arrow_builder.build())); + + auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); + auto& bloom_filter_reader = parquet_reader->GetBloomFilterReader(); + auto row_group_bloom_filter = bloom_filter_reader.RowGroup(0); + ASSERT_NE(nullptr, row_group_bloom_filter); + auto bloom_filter = row_group_bloom_filter->GetColumnBloomFilter(0); + ASSERT_NE(nullptr, bloom_filter); + + int32_t present = 4; + EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(present))); +} + +TEST(ArrowReadWrite, FixedSizeListVectorContentDefinedChunkingRoundTrip) { + std::stringstream json; + json << "["; + for (int i = 0; i < 40; ++i) { + if (i != 0) { + json << ","; + } + json << "[" << (3 * i + 1) << "," << (3 * i + 2) << "," << (3 * i + 3) + << "]"; + } + json << "]"; + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), json.str(), + /*nullable=*/false); + + CdcOptions cdc_options; + cdc_options.min_chunk_size = 64; + cdc_options.max_chunk_size = 128; + auto writer_properties = WriterProperties::Builder() + .enable_content_defined_chunking() + ->content_defined_chunking_options(cdc_options) + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(table, /*row_group_size=*/40, &result, + writer_properties, arrow_builder.build())); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), false); + ::arrow::AssertTablesEqual(*table, *result, false); +} + +TEST(ArrowReadWrite, FixedSizeListVectorContentDefinedChunkingNullableRoundTrip) { + auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ + [1, 2, 3], + null, + [4, null, 6], + [7, 8, 9]])", + /*nullable=*/true, + /*element_nullable=*/true); + + CdcOptions cdc_options; + cdc_options.min_chunk_size = 64; + cdc_options.max_chunk_size = 128; + auto writer_properties = WriterProperties::Builder() + .disable_dictionary() + ->enable_content_defined_chunking() + ->content_defined_chunking_options(cdc_options) + ->build(); + ArrowWriterProperties::Builder arrow_builder; + arrow_builder.enable_experimental_vector_encoding(); + + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(table, /*row_group_size=*/4, &result, + writer_properties, arrow_builder.build())); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), false); + ::arrow::AssertTablesEqual(*table, *result, false); } TEST(ArrowReadWrite, FixedSizeListVectorByteStreamSplitRoundTrip) { diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 1ae22f42b79a..06a6f681e649 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -190,46 +190,8 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo Status ValidateVectorColumnProperties(const SchemaDescriptor* schema, const WriterProperties& properties) { - for (int i = 0; i < schema->num_columns(); ++i) { - const ColumnDescriptor* column = schema->Column(i); - if (!column->in_vector_column()) { - continue; - } - const auto& path = column->path(); - if (properties.dictionary_enabled(path)) { - return Status::Invalid( - "Experimental VECTOR encoding does not support dictionary " - "encoding for column '", - path->ToDotString(), "'"); - } - if (properties.statistics_enabled(path)) { - return Status::Invalid( - "Experimental VECTOR encoding does not support statistics " - "for column '", - path->ToDotString(), "'"); - } - if (properties.page_index_enabled(path)) { - return Status::Invalid( - "Experimental VECTOR encoding does not support page index " - "for column '", - path->ToDotString(), "'"); - } - if (properties.bloom_filter_options(path).has_value()) { - return Status::Invalid( - "Experimental VECTOR encoding does not support bloom filters " - "for column '", - path->ToDotString(), "'"); - } - if (properties.content_defined_chunking_enabled()) { - // CDC is a file-level switch; report it against the first vector column we see - // so the user gets a clear, actionable error rather than a late ParquetException - // from WriteLevels complaining about partial vector batches. - return Status::Invalid( - "Experimental VECTOR encoding does not support content-defined chunking " - "(column '", - path->ToDotString(), "' is VECTOR-encoded)"); - } - } + ARROW_UNUSED(schema); + ARROW_UNUSED(properties); return Status::OK(); } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 97f2f9a6fb35..36674a6ba949 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -83,6 +83,27 @@ namespace parquet { namespace { +std::vector<::parquet::internal::Chunk> AlignCdcChunksToVectorBoundaries( + std::vector<::parquet::internal::Chunk> chunks, int32_t vector_length) { + if (vector_length <= 1 || chunks.size() <= 1) { + return chunks; + } + + std::vector<::parquet::internal::Chunk> aligned; + aligned.reserve(chunks.size()); + auto pending = chunks.front(); + for (size_t i = 1; i < chunks.size(); ++i) { + if (pending.levels_to_write % vector_length == 0) { + aligned.push_back(pending); + pending = chunks[i]; + } else { + pending.levels_to_write += chunks[i].levels_to_write; + } + } + aligned.push_back(pending); + return aligned; +} + // Visitor that extracts the value buffer from a FlatArray at a given offset. struct ValueBufferSlicer { template @@ -1491,6 +1512,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, DCHECK(content_defined_chunker_.has_value()); auto chunks = content_defined_chunker_->GetChunks(def_levels, rep_levels, num_levels, leaf_array); + if (is_vector && descr_->max_repetition_level() == 0) { + chunks = AlignCdcChunksToVectorBoundaries(std::move(chunks), + descr_->effective_vector_length()); + } for (size_t i = 0; i < chunks.size(); i++) { auto chunk = chunks[i]; auto chunk_array = leaf_array.Slice(chunk.value_offset); From e5f0d38f77c31e51a6853790a6e330de1b60ce55 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 May 2026 16:52:45 +0200 Subject: [PATCH 3/5] work --- cpp/src/generated/parquet_types.cpp | 30 +- cpp/src/generated/parquet_types.h | 15 +- .../parquet/arrow/arrow_reader_writer_test.cc | 326 ------------------ cpp/src/parquet/arrow/schema.cc | 10 +- cpp/src/parquet/arrow/writer.cc | 9 - cpp/src/parquet/parquet.thrift | 18 +- cpp/src/parquet/properties.h | 4 +- 7 files changed, 33 insertions(+), 379 deletions(-) diff --git a/cpp/src/generated/parquet_types.cpp b/cpp/src/generated/parquet_types.cpp index 3f8c9899a608..0392d3f6fc67 100644 --- a/cpp/src/generated/parquet_types.cpp +++ b/cpp/src/generated/parquet_types.cpp @@ -337,16 +337,13 @@ int _kFieldRepetitionTypeValues[] = { */ FieldRepetitionType::REPEATED, /** - * Experimental Option B prototype for fixed-size vector/list values. - * + * Experimental fixed-size vector/list values. * This field repeats vector_length times per parent value without increasing the - * maximum definition or repetition level of its descendants. VECTOR-aware - * readers use the fixed multiplicity from the schema to group the physical - * child values back into one logical vector value. Vector-level nullability is - * represented by an OPTIONAL parent node. - * - * Readers that do not understand VECTOR are expected to reject the file rather - * than attempting a LIST fallback. + * maximum definition or repetition level of its descendants. VECTOR-aware readers + * use the fixed multiplicity from the schema to group the physical child values + * back into one logical vector value. Vector-level nullability is represented by + * an OPTIONAL parent node. Readers that do not understand VECTOR are expected to + * reject the file rather than attempting a LIST fallback. */ FieldRepetitionType::VECTOR }; @@ -364,16 +361,13 @@ const char* _kFieldRepetitionTypeNames[] = { */ "REPEATED", /** - * Experimental Option B prototype for fixed-size vector/list values. - * + * Experimental fixed-size vector/list values. * This field repeats vector_length times per parent value without increasing the - * maximum definition or repetition level of its descendants. VECTOR-aware - * readers use the fixed multiplicity from the schema to group the physical - * child values back into one logical vector value. Vector-level nullability is - * represented by an OPTIONAL parent node. - * - * Readers that do not understand VECTOR are expected to reject the file rather - * than attempting a LIST fallback. + * maximum definition or repetition level of its descendants. VECTOR-aware readers + * use the fixed multiplicity from the schema to group the physical child values + * back into one logical vector value. Vector-level nullability is represented by + * an OPTIONAL parent node. Readers that do not understand VECTOR are expected to + * reject the file rather than attempting a LIST fallback. */ "VECTOR" }; diff --git a/cpp/src/generated/parquet_types.h b/cpp/src/generated/parquet_types.h index 0bd7544114d7..bcc3512f7d14 100644 --- a/cpp/src/generated/parquet_types.h +++ b/cpp/src/generated/parquet_types.h @@ -204,16 +204,13 @@ struct FieldRepetitionType { */ REPEATED = 2, /** - * Experimental Option B prototype for fixed-size vector/list values. - * + * Experimental fixed-size vector/list values. * This field repeats vector_length times per parent value without increasing the - * maximum definition or repetition level of its descendants. VECTOR-aware - * readers use the fixed multiplicity from the schema to group the physical - * child values back into one logical vector value. Vector-level nullability is - * represented by an OPTIONAL parent node. - * - * Readers that do not understand VECTOR are expected to reject the file rather - * than attempting a LIST fallback. + * maximum definition or repetition level of its descendants. VECTOR-aware readers + * use the fixed multiplicity from the schema to group the physical child values + * back into one logical vector value. Vector-level nullability is represented by + * an OPTIONAL parent node. Readers that do not understand VECTOR are expected to + * reject the file rather than attempting a LIST fallback. */ VECTOR = 3 }; diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 40595938be7f..332a2636f187 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3491,23 +3491,6 @@ TEST(ArrowWriteOnly, FixedSizeListVectorNullRowsEmitFixedWidthNullSlots) { ASSERT_EQ(reader->metadata()->RowGroup(0)->ColumnChunk(0)->num_values(), 9); } -TEST(ArrowReadWrite, FixedSizeListVectorRequiredRoundTrip) { - auto table = MakeVectorFixedSizeListTable(R"([ - [1, 2, 3], - [4, 5, 6], - [7, 8, 9]])", - /*nullable=*/false); - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - -TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTrip) { - auto table = MakeVectorFixedSizeListTable(R"([ - [1, 2, 3], - null, - [7, 8, 9]])"); - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/3)); -} - TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripNullPatterns) { const std::vector cases = { R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", @@ -3526,21 +3509,6 @@ TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripNullPatterns) { } } -TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripRowGroups) { - auto table = MakeVectorFixedSizeListTable(R"([ - [1, 2, 3], - null, - [7, 8, 9], - null, - [13, 14, 15], - [16, 17, 18]])"); - - ArrowReaderProperties reader_properties; - reader_properties.set_batch_size(1); - ASSERT_NO_FATAL_FAILURE( - CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2, reader_properties)); -} - TEST(ArrowReadWrite, FixedSizeListVectorStructOfVectorsRoundTrip) { auto ids_type = ::arrow::fixed_size_list( ::arrow::field("value", ::arrow::int32(), /*nullable=*/true), 2); @@ -3644,17 +3612,6 @@ TEST(ArrowReadWrite, FixedSizeListVectorNullableRoundTripSlicedInput) { ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } -TEST(ArrowReadWrite, FixedSizeListVectorRequiredRowsNullableElementsRoundTrip) { - auto table = MakeVectorFixedSizeListTable(R"([ - [1, null, 3], - [null, 5, 6], - [7, 8, null]])", - /*nullable=*/false, - /*element_nullable=*/true); - - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - TEST(ArrowReadWrite, FixedSizeListVectorNullableRowsAndElementsRoundTrip) { auto table = MakeVectorFixedSizeListTable(R"([ [1, null, 3], @@ -3682,167 +3639,6 @@ TEST(ArrowReadWrite, FixedSizeListVectorNestedRoundTrip) { ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } -struct VectorPrimitiveRoundTripCase { - std::string name; - std::shared_ptr<::arrow::DataType> item_type; - std::shared_ptr<::arrow::DataType> expected_item_type; - std::string required_json; - std::string nullable_rows_json; - std::string nullable_rows_and_elements_json; -}; - -class VectorPrimitiveRoundTripTest - : public ::testing::TestWithParam {}; - -TEST_P(VectorPrimitiveRoundTripTest, RequiredRows) { - const auto& param = GetParam(); - auto table = MakeVectorFixedSizeListTable(param.item_type, param.required_json, - /*nullable=*/false, - /*element_nullable=*/false); - auto expected = - param.expected_item_type - ? MakeVectorFixedSizeListTable(param.expected_item_type, param.required_json, - /*nullable=*/false, - /*element_nullable=*/false) - : nullptr; - ArrowWriterProperties::Builder builder; - builder.enable_experimental_vector_encoding(); - ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( - table, expected, VectorWriterProperties(), builder.build())); -} - -TEST_P(VectorPrimitiveRoundTripTest, NullableRows) { - const auto& param = GetParam(); - auto table = MakeVectorFixedSizeListTable(param.item_type, param.nullable_rows_json, - /*nullable=*/true, - /*element_nullable=*/false); - auto expected = param.expected_item_type - ? MakeVectorFixedSizeListTable(param.expected_item_type, - param.nullable_rows_json, - /*nullable=*/true, - /*element_nullable=*/false) - : nullptr; - ArrowWriterProperties::Builder builder; - builder.enable_experimental_vector_encoding(); - ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( - table, expected, VectorWriterProperties(), builder.build())); -} - -TEST_P(VectorPrimitiveRoundTripTest, NullableRowsAndElements) { - const auto& param = GetParam(); - auto table = - MakeVectorFixedSizeListTable(param.item_type, param.nullable_rows_and_elements_json, - /*nullable=*/true, - /*element_nullable=*/true); - auto expected = param.expected_item_type ? MakeVectorFixedSizeListTable( - param.expected_item_type, - param.nullable_rows_and_elements_json, - /*nullable=*/true, - /*element_nullable=*/true) - : nullptr; - ArrowWriterProperties::Builder builder; - builder.enable_experimental_vector_encoding(); - ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip( - table, expected, VectorWriterProperties(), builder.build())); -} - -INSTANTIATE_TEST_SUITE_P( - PrimitiveElements, VectorPrimitiveRoundTripTest, - ::testing::Values( - VectorPrimitiveRoundTripCase{ - "Boolean", ::arrow::boolean(), nullptr, - R"([[true, false, true], [false, true, false], [true, true, false]])", - R"([[true, false, true], null, [false, true, false]])", - R"([[true, null, true], null, [null, true, false]])"}, - VectorPrimitiveRoundTripCase{ - "UInt8", ::arrow::uint8(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{"Int8", ::arrow::int8(), nullptr, - R"([[1, -2, 3], [4, -5, 6], [7, -8, 9]])", - R"([[1, -2, 3], null, [7, -8, 9]])", - R"([[1, null, 3], null, [null, -8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "UInt16", ::arrow::uint16(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{"Int16", ::arrow::int16(), nullptr, - R"([[1, -2, 3], [4, -5, 6], [7, -8, 9]])", - R"([[1, -2, 3], null, [7, -8, 9]])", - R"([[1, null, 3], null, [null, -8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "UInt32", ::arrow::uint32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "Int32", ::arrow::int32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "UInt64", ::arrow::uint64(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "Int64", ::arrow::int64(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "Float16", ::arrow::float16(), nullptr, - R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", - R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", - R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, - VectorPrimitiveRoundTripCase{ - "Float32", ::arrow::float32(), nullptr, - R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", - R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", - R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, - VectorPrimitiveRoundTripCase{ - "Float64", ::arrow::float64(), nullptr, - R"([[1.0, 2.0, 3.0], [4.5, 5.5, 6.5], [7.25, 8.25, 9.25]])", - R"([[1.0, 2.0, 3.0], null, [7.25, 8.25, 9.25]])", - R"([[1.0, null, 3.0], null, [null, 8.25, 9.25]])"}, - VectorPrimitiveRoundTripCase{ - "Date32", ::arrow::date32(), nullptr, R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "Time32Milli", ::arrow::time32(::arrow::TimeUnit::MILLI), nullptr, - R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", - R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "Time64Micro", ::arrow::time64(::arrow::TimeUnit::MICRO), nullptr, - R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", - R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "TimestampMilli", ::arrow::timestamp(::arrow::TimeUnit::MILLI), nullptr, - R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", R"([[1, 2, 3], null, [7, 8, 9]])", - R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "DurationMilli", ::arrow::duration(::arrow::TimeUnit::MILLI), - ::arrow::int64(), R"([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - R"([[1, 2, 3], null, [7, 8, 9]])", R"([[1, null, 3], null, [null, 8, 9]])"}, - VectorPrimitiveRoundTripCase{ - "FixedSizeBinary", ::arrow::fixed_size_binary(4), nullptr, - R"([["ab01", "cd02", "ef03"], ["gh04", "ij05", "kl06"], ["mn07", "op08", "qr09"]])", - R"([["ab01", "cd02", "ef03"], null, ["mn07", "op08", "qr09"]])", - R"([["ab01", null, "ef03"], null, [null, "op08", "qr09"]])"}, - VectorPrimitiveRoundTripCase{ - "Decimal32", ::arrow::decimal32(6, 3), ::arrow::decimal128(6, 3), - R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", - R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", - R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, - VectorPrimitiveRoundTripCase{ - "Decimal64", ::arrow::decimal64(16, 3), ::arrow::decimal128(16, 3), - R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", - R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", - R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, - VectorPrimitiveRoundTripCase{ - "Decimal128", ::arrow::decimal128(20, 3), nullptr, - R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", - R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", - R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}, - VectorPrimitiveRoundTripCase{ - "Decimal256", ::arrow::decimal256(40, 3), nullptr, - R"([["1.234", "2.345", "3.456"], ["4.567", "5.678", "6.789"], ["7.890", "8.901", "9.012"]])", - R"([["1.234", "2.345", "3.456"], null, ["7.890", "8.901", "9.012"]])", - R"([["1.234", null, "3.456"], null, [null, "8.901", "9.012"]])"}), - [](const ::testing::TestParamInfo& info) { - return info.param.name; - }); - TEST(ArrowReadWrite, FixedSizeListVectorMixedColumnsRoundTripAcrossRowGroups) { auto vector_type = ::arrow::fixed_size_list(::arrow::field("item", ::arrow::int16(), true), @@ -3900,51 +3696,6 @@ TEST(ArrowReadWrite, FixedSizeListVectorMixedColumnsRoundTripAcrossRowGroups) { ::arrow::AssertTablesEqual(*table, *out, false); } -TEST(ArrowReadWrite, FixedSizeListVectorMixedColumnsRoundtripEqualsOriginal) { - constexpr int32_t kVectorSize = 4; - auto vector_type = ::arrow::fixed_size_list( - ::arrow::field("item", ::arrow::float32(), false), kVectorSize); - auto list_type = ::arrow::list(::arrow::field("item", ::arrow::int32(), false)); - - auto table = ::arrow::Table::Make( - ::arrow::schema({ - ::arrow::field("id", ::arrow::int32(), false), - ::arrow::field("embedding", vector_type, false), - ::arrow::field("label", ::arrow::utf8(), false), - ::arrow::field("tags", list_type, false), - }), - { - ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 1, 2, 3, 4]"), - ::arrow::ArrayFromJSON(vector_type, R"([ - [0.0, 0.1, 0.2, 0.3], - [1.0, 1.1, 1.2, 1.3], - [2.0, 2.1, 2.2, 2.3], - [3.0, 3.1, 3.2, 3.3], - [4.0, 4.1, 4.2, 4.3] - ])"), - ::arrow::ArrayFromJSON(::arrow::utf8(), - R"(["alpha", "bravo", "charlie", "delta", "echo"])"), - ::arrow::ArrayFromJSON(list_type, - R"([[10, 11], [], [12], [13, 14, 15], [16, 17]])"), - }); - - ArrowWriterProperties::Builder writer_builder; - writer_builder.enable_experimental_vector_encoding(); - ASSERT_OK_AND_ASSIGN( - auto buffer, WriteTableToBuffer(table, /*row_group_size=*/table->num_rows(), - VectorWriterProperties(), writer_builder.build())); - - std::unique_ptr reader; - FileReaderBuilder reader_builder; - ASSERT_OK(reader_builder.Open(std::make_shared(buffer))); - ASSERT_OK(reader_builder.Build(&reader)); - std::shared_ptr
out; - ASSERT_OK(reader->ReadTable(&out)); - - ::arrow::AssertSchemaEqual(*table->schema(), *out->schema(), false); - ::arrow::AssertTablesEqual(*table, *out, false); -} - std::shared_ptr<::arrow::DataType> VectorFixedSizeListStructType( bool element_nullable = false, bool field_nullable = false) { return ::arrow::fixed_size_list( @@ -3990,38 +3741,6 @@ std::shared_ptr
MakeVectorFixedSizeListNestedStructTable( return ::arrow::Table::Make(::arrow::schema({field}), {array}); } -TEST(ArrowReadWrite, FixedSizeListVectorStructRoundTrip) { - auto table = MakeVectorFixedSizeListStructTable(R"([ - [{"x": 1.0, "y": 1}, {"x": 2.0, "y": 2}], - [{"x": 3.0, "y": 3}, {"x": 4.0, "y": 4}], - [{"x": 5.0, "y": 5}, {"x": 6.0, "y": 6}]])", - /*nullable=*/false); - - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - -TEST(ArrowReadWrite, FixedSizeListVectorStructNullableRoundTrip) { - auto table = MakeVectorFixedSizeListStructTable(R"([ - [{"x": 1.0, "y": 1}, {"x": 2.0, "y": 2}], - null, - [{"x": 5.0, "y": 5}, {"x": 6.0, "y": 6}], - null])"); - - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - -TEST(ArrowReadWrite, FixedSizeListVectorStructNullableFieldsRoundTrip) { - auto table = MakeVectorFixedSizeListStructTable(R"([ - [{"x": 1.0, "y": 1}, {"x": 2.0, "y": null}], - null, - [{"x": 5.0, "y": null}, {"x": 6.0, "y": 6}]])", - /*nullable=*/true, - /*element_nullable=*/false, - /*field_nullable=*/true); - - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - TEST(ArrowReadWrite, FixedSizeListVectorNullableStructElementsRoundTrip) { auto table = MakeVectorFixedSizeListStructTable(R"([ [{"x": 1.0, "y": 1}, null], @@ -4035,20 +3754,6 @@ TEST(ArrowReadWrite, FixedSizeListVectorNullableStructElementsRoundTrip) { ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } -TEST(ArrowReadWrite, FixedSizeListVectorNestedStructRoundTrip) { - auto table = MakeVectorFixedSizeListNestedStructTable(R"([ - [{"point": {"x": 1.0, "y": 1}, "z": 10}, - {"point": {"x": 2.0, "y": null}, "z": null}], - [{"point": {"x": 3.0, "y": 3}, "z": 30}, - {"point": {"x": 4.0, "y": 4}, "z": 40}], - [{"point": {"x": 5.0, "y": null}, "z": 50}, - {"point": {"x": 6.0, "y": 6}, "z": 60}]])", - /*nullable=*/false, - /*element_nullable=*/false); - - ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); -} - TEST(ArrowReadWrite, FixedSizeListVectorNullableNestedStructElementsRoundTrip) { auto table = MakeVectorFixedSizeListNestedStructTable(R"([ [{"point": {"x": 1.0, "y": 1}, "z": 10}, null], @@ -4081,37 +3786,6 @@ TEST(ArrowWriteOnly, FixedSizeListVectorDefaultWriterProperties) { encodings.end()); } -TEST(ArrowWriteOnly, FixedSizeListVectorElementStatistics) { - auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ - [1, 2, 3], - [4, 5, 6]])", - /*nullable=*/false); - - auto writer_properties = WriterProperties::Builder() - .disable_dictionary() - ->encoding(Encoding::PLAIN) - ->build(); - ArrowWriterProperties::Builder arrow_builder; - arrow_builder.enable_experimental_vector_encoding(); - ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, - writer_properties, - arrow_builder.build())); - - auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); - auto column = parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0); - ASSERT_TRUE(column->is_stats_set()); - auto statistics = column->statistics(); - ASSERT_EQ(6, statistics->num_values()); - ASSERT_EQ(0, statistics->null_count()); - - std::shared_ptr min, max; - ASSERT_OK(StatisticsAsScalars(*statistics, &min, &max)); - ASSERT_OK_AND_ASSIGN(auto expected_min, ::arrow::MakeScalar(::arrow::int32(), 1)); - ASSERT_OK_AND_ASSIGN(auto expected_max, ::arrow::MakeScalar(::arrow::int32(), 6)); - ::arrow::AssertScalarsEqual(*expected_min, *min, /*verbose=*/true); - ::arrow::AssertScalarsEqual(*expected_max, *max, /*verbose=*/true); -} - TEST(ArrowWriteOnly, FixedSizeListVectorNullableElementStatistics) { auto table = MakeVectorFixedSizeListTable(::arrow::int32(), R"([ [1, 2, 3], diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 3c0e502a2f57..0893efeb700a 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -136,8 +136,8 @@ Status ValidateSupportedVectorStructNode(const Node& node) { return Status::OK(); } return Status::NotImplemented( - "VECTOR elements only support primitive, nested VECTOR, or nested struct fields " - "in the current prototype; repeated/list/map descendants are deferred"); + "VECTOR elements only support primitive, nested VECTOR, or nested struct fields; " + "repeated/list/map descendants are not supported"); } Status FixedSizeListValueToVectorNode( @@ -581,9 +581,9 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, case ArrowTypeId::FIXED_SIZE_LIST: { auto list_type = std::static_pointer_cast<::arrow::FixedSizeListType>(field->type()); - // Experimental VECTOR encoding is opportunistic: use it for FixedSizeList - // fields supported by this Option B prototype, and preserve writability by - // falling back to the standard LIST encoding for unsupported fields. This + // Experimental VECTOR encoding is opportunistic: use it for supported + // FixedSizeList fields, and preserve writability by falling back to the + // standard LIST encoding for unsupported fields. This // lets schemas mix VECTOR-friendly dense numeric/struct vectors with // FixedSizeList values that still need LIST machinery (for example strings). if (arrow_properties.write_fixed_size_list_as_vector() && diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 06a6f681e649..f78a2975cfa4 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -188,13 +188,6 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo return Status::OK(); } -Status ValidateVectorColumnProperties(const SchemaDescriptor* schema, - const WriterProperties& properties) { - ARROW_UNUSED(schema); - ARROW_UNUSED(properties); - return Status::OK(); -} - // Manages writing nested parquet columns with support for all nested types // supported by parquet. class ArrowColumnWriterV2 { @@ -396,8 +389,6 @@ class FileWriterImpl : public FileWriter { } Status Init() { - RETURN_NOT_OK( - ValidateVectorColumnProperties(writer_->schema(), *writer_->properties())); return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr, default_arrow_reader_properties(), &schema_manifest_); } diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 633d55d649a5..ee59ff5eb8fa 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -191,17 +191,13 @@ enum FieldRepetitionType { /** The field is repeated and can contain 0 or more values */ REPEATED = 2; - /** - * Experimental Option B prototype for fixed-size vector/list values. - * + /** Experimental fixed-size vector/list values. * This field repeats vector_length times per parent value without increasing the - * maximum definition or repetition level of its descendants. VECTOR-aware - * readers use the fixed multiplicity from the schema to group the physical - * child values back into one logical vector value. Vector-level nullability is - * represented by an OPTIONAL parent node. - * - * Readers that do not understand VECTOR are expected to reject the file rather - * than attempting a LIST fallback. + * maximum definition or repetition level of its descendants. VECTOR-aware readers + * use the fixed multiplicity from the schema to group the physical child values + * back into one logical vector value. Vector-level nullability is represented by + * an OPTIONAL parent node. Readers that do not understand VECTOR are expected to + * reject the file rather than attempting a LIST fallback. */ VECTOR = 3; } @@ -1111,7 +1107,7 @@ union ColumnOrder { * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. * - When looking for NaN values, min and max should be ignored. - * + * * When writing statistics the following rules should be followed: * - NaNs should not be written to min or max statistics fields. * - If the computed max value is zero (whether negative or positive), diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index cf6813e06438..2cf375ccefe4 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1368,7 +1368,9 @@ class PARQUET_EXPORT ArrowWriterProperties { return this; } - /// \brief EXPERIMENTAL: encode Arrow FixedSizeList as Parquet VECTOR. + /// \brief EXPERIMENTAL: encode supported Arrow FixedSizeList values as Parquet + /// VECTOR. Unsupported FixedSizeList values, including zero-length lists, continue + /// to use the standard LIST encoding. Builder* enable_experimental_vector_encoding() { write_fixed_size_list_as_vector_ = true; return this; From 1940c5d277ff88d6a740b2b60fc44d792bcc0b08 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 May 2026 18:15:18 +0200 Subject: [PATCH 4/5] add python wrapper --- python/pyarrow/_dataset_parquet.pyx | 7 ++++++ python/pyarrow/_parquet.pxd | 1 + python/pyarrow/_parquet.pyx | 12 ++++++++-- python/pyarrow/includes/libparquet.pxd | 1 + python/pyarrow/parquet/core.py | 8 +++++++ .../pyarrow/tests/parquet/test_data_types.py | 22 +++++++++++++++++++ 6 files changed, 49 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 534f7790923a..6567b95a9ab1 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -620,6 +620,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): "coerce_timestamps", "allow_truncated_timestamps", "use_compliant_nested_type", + "experimental_vector_encoding", } setters = set() @@ -676,6 +677,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): writer_engine_version="V2", use_compliant_nested_type=( self._properties["use_compliant_nested_type"] + ), + store_schema=True, + write_time_adjusted_to_utc=False, + experimental_vector_encoding=( + self._properties["experimental_vector_encoding"] ) ) @@ -705,6 +711,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): coerce_timestamps=None, allow_truncated_timestamps=False, use_compliant_nested_type=True, + experimental_vector_encoding=False, encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 36fc2ccf2f33..fa0a06e53c7e 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -69,6 +69,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( use_compliant_nested_type=*, store_schema=*, write_time_adjusted_to_utc=*, + experimental_vector_encoding=*, ) except * diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 2358a961ebd9..39cb5dac42f0 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2281,7 +2281,8 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( writer_engine_version=None, use_compliant_nested_type=True, store_schema=True, - write_time_adjusted_to_utc=False) except *: + write_time_adjusted_to_utc=False, + experimental_vector_encoding=False) except *: """Arrow writer properties""" cdef: shared_ptr[ArrowWriterProperties] arrow_properties @@ -2322,6 +2323,11 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( else: arrow_props.disable_compliant_nested_types() + # experimental_vector_encoding + + if experimental_vector_encoding: + arrow_props.enable_experimental_vector_encoding() + # writer_engine_version if writer_engine_version == "V1": @@ -2396,7 +2402,8 @@ cdef class ParquetWriter(_Weakrefable): store_decimal_as_integer=False, use_content_defined_chunking=False, write_time_adjusted_to_utc=False, - bloom_filter_options=None): + bloom_filter_options=None, + experimental_vector_encoding=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -2443,6 +2450,7 @@ cdef class ParquetWriter(_Weakrefable): use_compliant_nested_type=use_compliant_nested_type, store_schema=store_schema, write_time_adjusted_to_utc=write_time_adjusted_to_utc, + experimental_vector_encoding=experimental_vector_encoding, ) pool = maybe_unbox_memory_pool(memory_pool) diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index a834bd5dfa0c..b1f7e0240607 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -528,6 +528,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* store_schema() Builder* enable_compliant_nested_types() Builder* disable_compliant_nested_types() + Builder* enable_experimental_vector_encoding() Builder* set_engine_version(ArrowWriterEngineVersion version) Builder* set_time_adjusted_to_utc(c_bool adjusted) shared_ptr[ArrowWriterProperties] build() diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 080bfa55c234..6d281400ce91 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -876,6 +876,10 @@ def _sanitize_table(table, new_schema, flavor): dictionary_pagesize_limit : int, default None Specify the dictionary page size limit per row group. If None, use the default 1MB. +experimental_vector_encoding : bool, default False + EXPERIMENTAL: Encode supported fixed-size list values, including the storage + of fixed-shape tensor extension arrays, using Parquet VECTOR repetition. + Unsupported fixed-size list values continue using standard LIST encoding. store_schema : bool, default True By default, the Arrow schema is serialized and stored in the Parquet file metadata (in the "ARROW:schema" key). When reading the file, @@ -1083,6 +1087,7 @@ def __init__(self, where, schema, filesystem=None, store_decimal_as_integer=False, write_time_adjusted_to_utc=False, max_rows_per_page=None, + experimental_vector_encoding=False, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1138,6 +1143,7 @@ def __init__(self, where, schema, filesystem=None, store_decimal_as_integer=store_decimal_as_integer, write_time_adjusted_to_utc=write_time_adjusted_to_utc, max_rows_per_page=max_rows_per_page, + experimental_vector_encoding=experimental_vector_encoding, **options) self.is_open = True @@ -2017,6 +2023,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_time_adjusted_to_utc=False, max_rows_per_page=None, bloom_filter_options=None, + experimental_vector_encoding=False, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -2051,6 +2058,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_time_adjusted_to_utc=write_time_adjusted_to_utc, max_rows_per_page=max_rows_per_page, bloom_filter_options=bloom_filter_options, + experimental_vector_encoding=experimental_vector_encoding, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index c546bc1532ac..3c965b2325e4 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -581,6 +581,28 @@ def test_json_extension_type(storage_type): store_schema=False) +def test_fixed_shape_tensor_vector_encoding(): + tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) + storage = pa.array([ + [1, 2, 3, 4, 5, 6], + None, + [7, 8, 9, 10, 11, 12], + ], pa.list_(pa.float32(), 6)) + arr = pa.ExtensionArray.from_storage(tensor_type, storage) + table = pa.table({"tensor": arr}) + + sink = pa.BufferOutputStream() + pq.write_table(table, sink, experimental_vector_encoding=True) + buf = sink.getvalue() + + metadata = pq.read_metadata(pa.BufferReader(buf)) + assert "vector" in str(metadata.schema).lower() + + result = pq.read_table(pa.BufferReader(buf)) + assert result.schema == table.schema + assert result.equals(table) + + def test_uuid_extension_type(): data = [ b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', From 3c2e077d22dc12a2c744eb01785975d55756d38d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 29 May 2026 20:33:44 +0200 Subject: [PATCH 5/5] lint --- .../parquet/arrow/arrow_reader_writer_test.cc | 45 ++++++------- cpp/src/parquet/arrow/path_internal.cc | 16 ++--- cpp/src/parquet/arrow/reader.cc | 3 +- cpp/src/parquet/arrow/schema.cc | 67 +++++++++---------- cpp/src/parquet/types.h | 1 - 5 files changed, 64 insertions(+), 68 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 332a2636f187..2a1c4eeda5db 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3520,8 +3520,8 @@ TEST(ArrowReadWrite, FixedSizeListVectorStructOfVectorsRoundTrip) { {"ids": [10, null], "scores": [0.1, 0.2, 0.3]}, {"ids": null, "scores": [1.1, 1.2, 1.3]}, {"ids": [30, 40], "scores": [2.1, 2.2, 2.3]}])"); - auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), - {array}); + auto table = + ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } @@ -3533,37 +3533,37 @@ TEST(ArrowReadWrite, FixedSizeListVectorListOfVectorsRoundTrip) { [[1, 2], [3, 4]], [[5, 6]], [[7, 8], [9, 10], [11, 12]]])"); - auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), - {array}); + auto table = + ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } TEST(ArrowReadWrite, FixedSizeListVectorListOfStructsWithVectorsRoundTrip) { auto vector_type = ::arrow::fixed_size_list(::arrow::float32(), 3); - auto struct_type = ::arrow::struct_( - {::arrow::field("embedding", vector_type, /*nullable=*/false)}); + auto struct_type = + ::arrow::struct_({::arrow::field("embedding", vector_type, /*nullable=*/false)}); auto type = ::arrow::list(::arrow::field("item", struct_type, /*nullable=*/false)); auto array = ::arrow::ArrayFromJSON(type, R"([ [{"embedding": [1.0, 2.0, 3.0]}, {"embedding": [4.0, 5.0, 6.0]}], [{"embedding": [7.0, 8.0, 9.0]}], [{"embedding": [10.0, 11.0, 12.0]}, {"embedding": [13.0, 14.0, 15.0]}]])"); - auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), - {array}); + auto table = + ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } TEST(ArrowReadWrite, FixedSizeListVectorNestedVectorsRoundTrip) { auto inner_type = ::arrow::fixed_size_list(::arrow::uint8(), 2); - auto type = ::arrow::fixed_size_list( - ::arrow::field("item", inner_type, /*nullable=*/false), 2); + auto type = + ::arrow::fixed_size_list(::arrow::field("item", inner_type, /*nullable=*/false), 2); auto array = ::arrow::ArrayFromJSON(type, R"([ [[1, 2], [3, 4]], null, [[5, 6], [7, 8]]])"); - auto table = ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), - {array}); + auto table = + ::arrow::Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); ASSERT_NO_FATAL_FAILURE(CheckVectorFixedSizeListRoundtrip(table, /*row_group_size=*/2)); } @@ -3800,9 +3800,9 @@ TEST(ArrowWriteOnly, FixedSizeListVectorNullableElementStatistics) { ->build(); ArrowWriterProperties::Builder arrow_builder; arrow_builder.enable_experimental_vector_encoding(); - ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/3, - writer_properties, - arrow_builder.build())); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/3, writer_properties, + arrow_builder.build())); auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); auto column = parquet_reader->metadata()->RowGroup(0)->ColumnChunk(0); @@ -3833,9 +3833,9 @@ TEST(ArrowWriteOnly, FixedSizeListVectorPageIndex) { ->build(); ArrowWriterProperties::Builder arrow_builder; arrow_builder.enable_experimental_vector_encoding(); - ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, - writer_properties, - arrow_builder.build())); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/2, writer_properties, + arrow_builder.build())); auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); auto page_index_reader = parquet_reader->GetPageIndexReader(); @@ -3860,9 +3860,9 @@ TEST(ArrowWriteOnly, FixedSizeListVectorBloomFilter) { ->build(); ArrowWriterProperties::Builder arrow_builder; arrow_builder.enable_experimental_vector_encoding(); - ASSERT_OK_AND_ASSIGN(auto buffer, WriteTableToBuffer(table, /*row_group_size=*/2, - writer_properties, - arrow_builder.build())); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/2, writer_properties, + arrow_builder.build())); auto parquet_reader = ParquetFileReader::Open(std::make_shared(buffer)); auto& bloom_filter_reader = parquet_reader->GetBloomFilterReader(); @@ -3882,8 +3882,7 @@ TEST(ArrowReadWrite, FixedSizeListVectorContentDefinedChunkingRoundTrip) { if (i != 0) { json << ","; } - json << "[" << (3 * i + 1) << "," << (3 * i + 2) << "," << (3 * i + 3) - << "]"; + json << "[" << (3 * i + 1) << "," << (3 * i + 2) << "," << (3 * i + 3) << "]"; } json << "]"; auto table = MakeVectorFixedSizeListTable(::arrow::int32(), json.str(), diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index c131f80c244d..df64bb020ba0 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -669,10 +669,11 @@ Status ExpandVectorRepLevels(PathInfo* path_info, PathWriteContext* context, const int64_t rep_length = context->rep_levels.length(); const int64_t def_length = context->def_levels.length(); if (def_length != rep_length * path_info->leaf_vector_length) { - return Status::Invalid("VECTOR repetition level expansion mismatch: repetition " - "level count ", - rep_length, ", definition level count ", def_length, - ", vector length ", path_info->leaf_vector_length); + return Status::Invalid( + "VECTOR repetition level expansion mismatch: repetition " + "level count ", + rep_length, ", definition level count ", def_length, ", vector length ", + path_info->leaf_vector_length); } ExpandVectorRepLevels(context->rep_levels.data(), rep_length, path_info->leaf_vector_length, path_info->max_rep_level, @@ -977,10 +978,9 @@ class PathBuilder { const bool value_type_is_struct = array.value_type()->id() == ::arrow::Type::STRUCT; const bool value_type_is_nested_vector = array.value_type()->id() == ::arrow::Type::FIXED_SIZE_LIST; - const bool child_emits_present_def_levels = element_nullable || - value_type_is_struct || - value_type_is_nested_vector || - nested_vector_path; + const bool child_emits_present_def_levels = + element_nullable || value_type_is_struct || value_type_is_nested_vector || + nested_vector_path; const bool child_records_visited_elements = value_type_is_nested_vector; const bool has_vector_node = parent_nullable || element_nullable || value_type_is_struct || value_type_is_nested_vector || diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 091d536b32ee..8ebaffdceeef 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -782,8 +782,7 @@ class PARQUET_NO_EXPORT VectorFixedSizeListReader : public ColumnReaderImpl { RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_rep_levels)); if (rep_levels != nullptr && num_rep_levels != num_levels) { return Status::Invalid("VECTOR child produced ", num_rep_levels, - " repetition levels for ", num_levels, - " definition levels"); + " repetition levels for ", num_levels, " definition levels"); } if (num_levels % list_size_ != 0) { return Status::Invalid("VECTOR column produced a non-multiple of vector length ", diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 0893efeb700a..0635a82aa30c 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -140,10 +140,11 @@ Status ValidateSupportedVectorStructNode(const Node& node) { "repeated/list/map descendants are not supported"); } -Status FixedSizeListValueToVectorNode( - const std::shared_ptr<::arrow::Field>& value_field, const std::string& value_name, - int32_t list_size, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { +Status FixedSizeListValueToVectorNode(const std::shared_ptr<::arrow::Field>& value_field, + const std::string& value_name, int32_t list_size, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + NodePtr* out) { if (list_size <= 0) { return Status::NotImplemented( "VECTOR repetition does not support zero-length FixedSizeList values"); @@ -155,11 +156,11 @@ Status FixedSizeListValueToVectorNode( } if (value_field->type()->id() == ::arrow::Type::FIXED_SIZE_LIST) { - const auto& nested_type = checked_cast( - *value_field->type()); - const std::string nested_value_name = - arrow_properties.compliant_nested_types() ? "element" - : nested_type.value_field()->name(); + const auto& nested_type = + checked_cast(*value_field->type()); + const std::string nested_value_name = arrow_properties.compliant_nested_types() + ? "element" + : nested_type.value_field()->name(); NodePtr nested_vector; RETURN_NOT_OK(FixedSizeListValueToVectorNode( nested_type.value_field(), nested_value_name, nested_type.list_size(), properties, @@ -188,14 +189,15 @@ Status FixedSizeListValueToVectorNode( /*logical_type=*/nullptr, /*field_id=*/-1, list_size); } else { const auto& primitive = checked_cast(*element); - *out = PrimitiveNode::Make(value_name, Repetition::VECTOR, - primitive.logical_type(), primitive.physical_type(), - primitive.type_length(), primitive.field_id(), list_size); + *out = PrimitiveNode::Make(value_name, Repetition::VECTOR, primitive.logical_type(), + primitive.physical_type(), primitive.type_length(), + primitive.field_id(), list_size); } return Status::OK(); } - const auto& struct_type = checked_cast(*value_field->type()); + const auto& struct_type = + checked_cast(*value_field->type()); std::vector children(struct_type.num_fields()); for (int i = 0; i < struct_type.num_fields(); ++i) { RETURN_NOT_OK(FieldToNode(struct_type.field(i)->name(), struct_type.field(i), @@ -227,9 +229,8 @@ Status FixedSizeListToNode(const std::shared_ptr<::arrow::FixedSizeListType>& ty arrow_properties.compliant_nested_types() ? "element" : value_field->name(); NodePtr vector; - RETURN_NOT_OK(FixedSizeListValueToVectorNode(value_field, value_name, - type->list_size(), properties, - arrow_properties, &vector)); + RETURN_NOT_OK(FixedSizeListValueToVectorNode(value_field, value_name, type->list_size(), + properties, arrow_properties, &vector)); RETURN_NOT_OK(ValidateSupportedVectorStructNode(*vector)); *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {vector}, /*logical_type=*/nullptr, field_id); @@ -799,15 +800,14 @@ Status VectorNodeToSchemaField(const Node& vector_node, LevelInfo current_levels GetTypeForNode(column_index, primitive_node, ctx)); auto item_field = ::arrow::field(vector_node.name(), type, /*nullable=*/false, FieldIdMetadata(vector_node.field_id())); - RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); } else { const auto& vector_group = static_cast(vector_node); if (vector_group.field_count() == 1 && vector_group.field(0)->is_vector()) { RETURN_NOT_OK(VectorNodeToSchemaField(*vector_group.field(0), current_levels, ctx, out, child_field)); - } else if (vector_group.field_count() == 1 && - vector_group.field(0)->is_group() && + } else if (vector_group.field_count() == 1 && vector_group.field(0)->is_group() && vector_group.field(0)->is_optional()) { const auto& optional_group = static_cast(*vector_group.field(0)); if (optional_group.field_count() == 1 && optional_group.field(0)->is_vector()) { @@ -818,11 +818,10 @@ Status VectorNodeToSchemaField(const Node& vector_node, LevelInfo current_levels } else { RETURN_NOT_OK(ValidateSupportedVectorStructNode(optional_group)); current_levels.IncrementOptional(); - RETURN_NOT_OK(GroupToStruct(optional_group, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + GroupToStruct(optional_group, current_levels, ctx, out, child_field)); } - } else if (vector_group.field_count() == 1 && - vector_group.field(0)->is_primitive()) { + } else if (vector_group.field_count() == 1 && vector_group.field(0)->is_primitive()) { const Node& element = *vector_group.field(0); bool element_nullable = element.is_optional(); if (element_nullable) { @@ -834,8 +833,8 @@ Status VectorNodeToSchemaField(const Node& vector_node, LevelInfo current_levels GetTypeForNode(column_index, primitive_node, ctx)); auto item_field = ::arrow::field(element.name(), type, element_nullable, FieldIdMetadata(element.field_id())); - RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); } else { RETURN_NOT_OK(ValidateSupportedVectorStructNode(vector_group)); RETURN_NOT_OK(GroupToStruct(vector_group, current_levels, ctx, out, child_field)); @@ -895,8 +894,8 @@ Status VectorToSchemaField(const GroupNode& group, LevelInfo current_levels, if (vector_group.field_count() == 1) { const Node& element = *vector_group.field(0); if (element.is_vector()) { - RETURN_NOT_OK(VectorNodeToSchemaField(element, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + VectorNodeToSchemaField(element, current_levels, ctx, out, child_field)); } else if (element.is_repeated()) { return Status::Invalid("VECTOR element children must not be REPEATED"); } else if (element.is_primitive()) { @@ -925,8 +924,8 @@ Status VectorToSchemaField(const GroupNode& group, LevelInfo current_levels, if (element_group.is_optional()) { current_levels.IncrementOptional(); } - RETURN_NOT_OK(GroupToStruct(element_group, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + GroupToStruct(element_group, current_levels, ctx, out, child_field)); } } } else { @@ -1240,12 +1239,12 @@ Status NodeToSchemaField(const Node& node, LevelInfo current_levels, ctx->LinkParent(child_field, out); auto item_field = ::arrow::field(node.name(), type, /*nullable=*/false, FieldIdMetadata(node.field_id())); - RETURN_NOT_OK(PopulateLeaf(column_index, item_field, current_levels, ctx, out, - child_field)); + RETURN_NOT_OK( + PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); MarkVectorSubtree(child_field); out->field = ::arrow::field( - node.name(), ::arrow::fixed_size_list(child_field->field, - primitive_node.vector_length()), + node.name(), + ::arrow::fixed_size_list(child_field->field, primitive_node.vector_length()), /*nullable=*/false, FieldIdMetadata(node.field_id())); out->level_info = current_levels; out->is_vector = true; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 3d639383e4ee..74a9600b5aee 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -237,7 +237,6 @@ class PARQUET_EXPORT LogicalType { static std::shared_ptr Variant( int8_t specVersion = kVariantSpecVersion); - static std::shared_ptr Geometry(std::string crs = ""); static std::shared_ptr Geography(