From 26cf0e0b154b188499676466579e977829c346f6 Mon Sep 17 00:00:00 2001
From: Ben Harkins <60872452+benibus@users.noreply.github.com>
Date: Fri, 17 Nov 2023 12:35:33 -0500
Subject: [PATCH] GH-38751: [C++][Go][Parquet] Add tests for reading Float16 files in parquet-testing (#38753)

### Rationale for this change

Validates compatibility between implementations when reading `Float16` columns.

### What changes are included in this PR?

- Bumps `parquet-testing` commit to latest to use the recently-added files
- Adds reader tests for C++ and Go in the same vein as https://github.com/apache/arrow-rs/pull/5003

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

* Closes: #38751

Authored-by: benibus
Signed-off-by: Matt Topol
---
 .../parquet/arrow/arrow_reader_writer_test.cc | 54 +++++++++++++++
 cpp/submodules/parquet-testing                |  2 +-
 go/parquet/pqarrow/file_reader_test.go        | 67 +++++++++++++++++++
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index a314ecbf747e7..9c6f7a044b589 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4099,6 +4099,60 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
         std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));
 
+TEST(TestArrowReaderAdHoc, ReadFloat16Files) {
+  using ::arrow::util::Float16;
+  constexpr auto nan = std::numeric_limits<Float16>::quiet_NaN();
+
+  struct TestCase {
+    std::string filename;       // base name of the file under the test data directory
+    int32_t len;                // expected row count, including the null at row 0
+    std::vector<Float16> vals;  // expected values of rows 1..len-1, in order
+  } test_cases[] = {
+      {"float16_nonzeros_and_nans",
+       8,
+       {Float16(+1.0), Float16(-2.0), nan, Float16(+0.0), Float16(-1.0), Float16(-0.0),
+        Float16(+2.0)}},
+      {"float16_zeros_and_nans", 3, {Float16(+0.0), nan}},
+  };
+
+  const auto pool = ::arrow::default_memory_pool();
+
+  for (const auto& tc : test_cases) {
+    std::string path(test::get_data_dir());
+    path += "/" + tc.filename + ".parquet";
+    ARROW_SCOPED_TRACE("path = ", path);
+
+    std::unique_ptr<FileReader> reader;
+    ASSERT_OK_NO_THROW(
+        FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &reader));
+    std::shared_ptr<::arrow::Table> table;
+    ASSERT_OK_NO_THROW(reader->ReadTable(&table));
+
+    std::shared_ptr<::arrow::Schema> schema;
+    ASSERT_OK_NO_THROW(reader->GetSchema(&schema));
+    ASSERT_EQ(1, schema->num_fields());
+    ASSERT_EQ(schema->field(0)->type()->id(), ::arrow::Type::HALF_FLOAT);
+
+    ASSERT_EQ(1, table->num_columns());
+    auto column = table->column(0);
+    ASSERT_EQ(tc.len, column->length());
+    ASSERT_EQ(1, column->num_chunks());
+
+    auto chunk = checked_pointer_cast<::arrow::HalfFloatArray>(column->chunk(0));
+    ASSERT_TRUE(chunk->IsNull(0));  // row 0 must be null; tc.vals starts at row 1
+    for (int32_t i = 0; i < tc.len - 1; ++i) {
+      const auto expected = tc.vals[i];
+      const auto actual = Float16::FromBits(chunk->Value(i + 1));
+      if (expected.is_nan()) {
+        // NaN representations aren't guaranteed to be exact on a binary level
+        ASSERT_TRUE(actual.is_nan());
+      } else {
+        ASSERT_EQ(expected.bits(), actual.bits());  // bitwise, so +0.0 != -0.0
+      }
+    }
+  }
+}
+
 // direct-as-possible translation of
 // pyarrow/tests/test_parquet.py::test_validate_schema_write_table
 TEST(TestArrowWriterAdHoc, SchemaMismatch) {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index e45cd23f784aa..89b685a64c311 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit e45cd23f784aab3d6bf0701f8f4e621469ed3be7
+Subproject commit 89b685a64c3117b3023d8684af1f41400841db71
diff --git a/go/parquet/pqarrow/file_reader_test.go b/go/parquet/pqarrow/file_reader_test.go
index 9c1b4252f5fc6..0c52eec9e3459 100644
--- a/go/parquet/pqarrow/file_reader_test.go
+++ b/go/parquet/pqarrow/file_reader_test.go
@@ -29,6 +29,7 @@ import (
 	"github.com/apache/arrow/go/v15/arrow"
 	"github.com/apache/arrow/go/v15/arrow/array"
"github.com/apache/arrow/go/v15/arrow/decimal128" + "github.com/apache/arrow/go/v15/arrow/float16" "github.com/apache/arrow/go/v15/arrow/memory" "github.com/apache/arrow/go/v15/parquet" "github.com/apache/arrow/go/v15/parquet/file" @@ -100,6 +101,72 @@ func TestArrowReaderAdHocReadDecimals(t *testing.T) { } } +func TestArrowReaderAdHocReadFloat16s(t *testing.T) { + tests := []struct { + file string + len int + vals []float16.Num + }{ + {"float16_nonzeros_and_nans", 8, + []float16.Num{ + float16.New(1.0), + float16.New(-2.0), + float16.NaN(), + float16.New(0.0), + float16.New(-1.0), + float16.New(0.0).Negate(), + float16.New(2.0), + }}, + {"float16_zeros_and_nans", 3, + []float16.Num{ + float16.New(0.0), + float16.NaN(), + }}, + } + + dataDir := getDataDir() + for _, tt := range tests { + t.Run(tt.file, func(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + filename := filepath.Join(dataDir, tt.file+".parquet") + require.FileExists(t, filename) + + rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem))) + require.NoError(t, err) + defer rdr.Close() + + arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem) + require.NoError(t, err) + + tbl, err := arrowRdr.ReadTable(context.Background()) + require.NoError(t, err) + defer tbl.Release() + + assert.EqualValues(t, 1, tbl.NumCols()) + assert.Truef(t, arrow.TypeEqual(tbl.Schema().Field(0).Type, &arrow.Float16Type{}), "expected: %s\ngot: %s", tbl.Schema().Field(0).Type, arrow.Float16Type{}) + + valCol := tbl.Column(0) + assert.EqualValues(t, tt.len, valCol.Len()) + assert.Len(t, valCol.Data().Chunks(), 1) + + chunk := valCol.Data().Chunk(0).(*array.Float16) + assert.True(t, chunk.IsNull(0)) + for i := 0; i < tt.len-1; i++ { + expected := tt.vals[i] + actual := chunk.Value(i + 1) + if expected.IsNaN() { + // NaN representations aren't guaranteed to be exact on a binary level + assert.True(t, 
actual.IsNaN()) + } else { + assert.Equal(t, expected.Uint16(), actual.Uint16()) + } + } + }) + } +} + func TestRecordReaderParallel(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0)