From fada4892a8123f395ccb5bfb3a1340904cb11257 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 22:20:16 +0800 Subject: [PATCH] add a roundtrip test --- .../parquet/arrow/arrow_reader_writer_test.cc | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 5d990a5c6bd4a..97434c1cbfa09 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4093,6 +4093,62 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { TryReadDataFile(path); } +TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) { + // Create schema with a nested list of two-level encoding: + constexpr std::string_view kExpectedSchema = + "required group field_id=-1 schema {\n" + " optional group field_id=-1 nested_list (List) {\n" + " repeated group field_id=-1 array (List) {\n" + " repeated int32 field_id=-1 array;\n" + " }\n" + " }\n" + "}\n"; + auto inner_element = PrimitiveNode::Make("array", Repetition::REPEATED, Type::INT32); + auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, + ConvertedType::LIST); + auto nested_list = GroupNode::Make("nested_list", Repetition::OPTIONAL, {outer_element}, + ConvertedType::LIST); + auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {nested_list}); + + // Create a Parquet writer to write values of nested list + auto sink = CreateOutputStream(); + auto file_writer = ParquetFileWriter::Open( + sink, std::dynamic_pointer_cast(schema_node)); + auto row_group_writer = file_writer->AppendRowGroup(); + auto int_writer = dynamic_cast(row_group_writer->NextColumn()); + ASSERT_TRUE(int_writer != nullptr); + + // Directly write a single row of nested list: [[1, 2],[3, 4]] + constexpr int64_t kNumValues = 4; + std::array rep_levels = {0, 2, 1, 2}; + std::array def_levels = {3, 3, 3, 3}; + std::array values = {1, 2, 3, 4}; + int_writer->WriteBatch(kNumValues, def_levels.data(), rep_levels.data(), values.data()); + file_writer->Close(); + + // Read schema and verify it applies two-level encoding of list type + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + ASSERT_EQ(kExpectedSchema, file_reader->metadata()->schema()->ToString()); + + // Read and verify data + std::unique_ptr reader; + ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); + std::shared_ptr table; + ASSERT_OK(reader->ReadTable(&table)); + + auto arrow_inner_element = + ::arrow::field("array", ::arrow::int32(), /*nullable=*/false); + auto arrow_outer_element = + ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_outer_element); + auto arrow_schema = + ::arrow::schema({::arrow::field("nested_list", arrow_list, /*nullable=*/true)}); + auto expected_table = ::arrow::TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); + ::arrow::AssertTablesEqual(*expected_table, *table); +} + class TestArrowReaderAdHocSparkAndHvr : public ::testing::TestWithParam< std::tuple>> {};