Skip to content

Commit

Permalink
GH-38751: [C++][Go][Parquet] Add tests for reading Float16 files in p…
Browse files Browse the repository at this point in the history
…arquet-testing (#38753)

### Rationale for this change

Validates compatibility between implementations when reading `Float16` columns.

### What changes are included in this PR?

- Bumps `parquet-testing` commit to latest to use the recently-added files
- Adds reader tests for C++ and Go in the same vein as apache/arrow-rs#5003

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

* Closes: #38751

Authored-by: benibus <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
benibus authored Nov 17, 2023
1 parent c353c81 commit 26cf0e0
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 1 deletion.
54 changes: 54 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4099,6 +4099,60 @@ INSTANTIATE_TEST_SUITE_P(
std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));

// Reads each Float16 sample file from the test data directory through the
// Arrow FileReader and checks the schema, the shape of the resulting table and
// every stored value. Each file is expected to yield exactly one HALF_FLOAT
// column in a single chunk whose first slot is null.
TEST(TestArrowReaderAdHoc, ReadFloat16Files) {
  using ::arrow::util::Float16;
  constexpr auto kNaN = std::numeric_limits<Float16>::quiet_NaN();

  struct TestCase {
    std::string filename;       // file stem; ".parquet" is appended below
    int32_t len;                // expected column length, leading null included
    std::vector<Float16> vals;  // expected values for slots 1 .. len - 1
  };
  const TestCase test_cases[] = {
      {"float16_nonzeros_and_nans",
       8,
       {Float16(+1.0), Float16(-2.0), kNaN, Float16(+0.0), Float16(-1.0),
        Float16(-0.0), Float16(+2.0)}},
      {"float16_zeros_and_nans", 3, {Float16(+0.0), kNaN}},
  };

  const auto memory_pool = ::arrow::default_memory_pool();

  for (const auto& tc : test_cases) {
    const std::string path =
        std::string(test::get_data_dir()) + "/" + tc.filename + ".parquet";
    ARROW_SCOPED_TRACE("path = ", path);

    // Open the file and materialize it as an Arrow table.
    std::unique_ptr<FileReader> file_reader;
    ASSERT_OK_NO_THROW(FileReader::Make(
        memory_pool, ParquetFileReader::OpenFile(path, false), &file_reader));
    std::shared_ptr<::arrow::Table> result;
    ASSERT_OK_NO_THROW(file_reader->ReadTable(&result));

    // The converted schema must expose a single half-float field.
    std::shared_ptr<::arrow::Schema> arrow_schema;
    ASSERT_OK_NO_THROW(file_reader->GetSchema(&arrow_schema));
    ASSERT_EQ(1, arrow_schema->num_fields());
    ASSERT_EQ(arrow_schema->field(0)->type()->id(), ::arrow::Type::HALF_FLOAT);

    // Shape checks: one column, one chunk, expected number of slots.
    ASSERT_EQ(1, result->num_columns());
    auto values_column = result->column(0);
    ASSERT_EQ(tc.len, values_column->length());
    ASSERT_EQ(1, values_column->num_chunks());

    auto half_floats =
        checked_pointer_cast<::arrow::HalfFloatArray>(values_column->chunk(0));
    ASSERT_TRUE(half_floats->IsNull(0));

    // Slot 0 is the null checked above; slot `row` pairs with vals[row - 1].
    for (int32_t row = 1; row < tc.len; ++row) {
      const Float16 expected = tc.vals[row - 1];
      const Float16 actual = Float16::FromBits(half_floats->Value(row));
      if (!expected.is_nan()) {
        ASSERT_EQ(expected.bits(), actual.bits());
        continue;
      }
      // NaN representations aren't guaranteed to be exact on a binary level
      ASSERT_TRUE(actual.is_nan());
    }
  }
}

// direct-as-possible translation of
// pyarrow/tests/test_parquet.py::test_validate_schema_write_table
TEST(TestArrowWriterAdHoc, SchemaMismatch) {
Expand Down
67 changes: 67 additions & 0 deletions go/parquet/pqarrow/file_reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/array"
"github.com/apache/arrow/go/v15/arrow/decimal128"
"github.com/apache/arrow/go/v15/arrow/float16"
"github.com/apache/arrow/go/v15/arrow/memory"
"github.com/apache/arrow/go/v15/parquet"
"github.com/apache/arrow/go/v15/parquet/file"
Expand Down Expand Up @@ -100,6 +101,72 @@ func TestArrowReaderAdHocReadDecimals(t *testing.T) {
}
}

// TestArrowReaderAdHocReadFloat16s reads the Float16 sample files from the test
// data directory and checks that each one decodes into a single Float16 column
// made of one chunk whose first slot is null, followed by the expected values.
//
// Structural checks that guard later indexing (column count, column length,
// chunk count) use require so that a mismatch fails the subtest immediately
// instead of continuing into an out-of-range access.
func TestArrowReaderAdHocReadFloat16s(t *testing.T) {
	tests := []struct {
		file string        // file stem; ".parquet" is appended below
		len  int           // expected column length, leading null included
		vals []float16.Num // expected values for slots 1 .. len-1
	}{
		{"float16_nonzeros_and_nans", 8,
			[]float16.Num{
				float16.New(1.0),
				float16.New(-2.0),
				float16.NaN(),
				float16.New(0.0),
				float16.New(-1.0),
				float16.New(0.0).Negate(),
				float16.New(2.0),
			}},
		{"float16_zeros_and_nans", 3,
			[]float16.Num{
				float16.New(0.0),
				float16.NaN(),
			}},
	}

	dataDir := getDataDir()
	for _, tt := range tests {
		t.Run(tt.file, func(t *testing.T) {
			// Guard against a malformed table entry before it is used for indexing.
			require.Len(t, tt.vals, tt.len-1)

			// Deferred first so it runs last, after the reader and table are released.
			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
			defer mem.AssertSize(t, 0)

			filename := filepath.Join(dataDir, tt.file+".parquet")
			require.FileExists(t, filename)

			rdr, err := file.OpenParquetFile(filename, false, file.WithReadProps(parquet.NewReaderProperties(mem)))
			require.NoError(t, err)
			defer rdr.Close()

			arrowRdr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, mem)
			require.NoError(t, err)

			tbl, err := arrowRdr.ReadTable(context.Background())
			require.NoError(t, err)
			defer tbl.Release()

			require.EqualValues(t, 1, tbl.NumCols())

			// Arguments follow the order of the "expected"/"got" labels in the
			// message, and the expected type is passed in the same pointer form
			// that is used for the comparison.
			wantType := &arrow.Float16Type{}
			gotType := tbl.Schema().Field(0).Type
			assert.Truef(t, arrow.TypeEqual(gotType, wantType), "expected: %s\ngot: %s", wantType, gotType)

			valCol := tbl.Column(0)
			require.EqualValues(t, tt.len, valCol.Len())
			require.Len(t, valCol.Data().Chunks(), 1)

			chunk := valCol.Data().Chunk(0).(*array.Float16)
			assert.True(t, chunk.IsNull(0))

			// Slot 0 is the null checked above; vals[i] pairs with slot i+1.
			for i, expected := range tt.vals {
				actual := chunk.Value(i + 1)
				if expected.IsNaN() {
					// NaN representations aren't guaranteed to be exact on a binary level
					assert.True(t, actual.IsNaN())
				} else {
					assert.Equal(t, expected.Uint16(), actual.Uint16())
				}
			}
		})
	}
}

func TestRecordReaderParallel(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
defer mem.AssertSize(t, 0)
Expand Down

0 comments on commit 26cf0e0

Please sign in to comment.