From fde2f6af3f791d26d813b141aec7a91029ccab16 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Dec 2024 17:51:07 +0100 Subject: [PATCH 1/5] [C++][Python] Fix ORC crash when file contains unknown timezone Our ORC adapter would let a C++ exception slip through instead of converting it into a Status error. --- python/pyarrow/tests/test_orc.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 1b467d523304c..dc0931fd90d9e 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -15,9 +15,12 @@ # specific language governing permissions and limitations # under the License. -import pytest import decimal import datetime +import subprocess +import sys + +import pytest import pyarrow as pa from pyarrow import fs @@ -140,6 +143,33 @@ def test_example_using_json(filename, datadir): check_example_file(path, table, need_fix=True) +def test_unknown_timezone(datadir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone is not found. + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + import re + os.environ['TZDIR'] = '/tmp/non_existent' + + from pyarrow import orc + orc_file = orc.ORCFile({str(path)!r}) + try: + orc_file.read() + except Exception as e: + assert re.search("Time zone file .* does not exist", str(e)), e + else: + assert False, "Should have raised exception" + try: + orc_file.read_stripe(0) + except Exception as e: + assert re.search("Time zone file .* does not exist", str(e)), e + else: + assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + def test_orcfile_empty(datadir): from pyarrow import orc From 94538a3a594bf76f3e7c6b1d284378c7e979eb04 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Dec 2024 18:06:33 +0100 Subject: [PATCH 2/5] Make test more lenient --- python/pyarrow/tests/test_orc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index dc0931fd90d9e..24ec5fb307e0d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -157,13 +157,13 @@ def test_unknown_timezone(datadir): try: orc_file.read() except Exception as e: - assert re.search("Time zone file .* does not exist", str(e)), e + assert "time zone" in e.lower(), e else: assert False, "Should have raised exception" try: orc_file.read_stripe(0) except Exception as e: - assert re.search("Time zone file .* does not exist", str(e)), e + assert "time zone" in e.lower(), e else: assert False, "Should have raised exception" """ From 32d593b6a919035127ff0b76371761192e5190ed Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Dec 2024 18:20:15 +0100 Subject: [PATCH 3/5] Apply fix --- cpp/src/arrow/adapters/orc/adapter.cc | 16 ++++++++-------- python/pyarrow/tests/test_orc.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index d16b6cfd2e97d..51cca497485ce 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -145,7 +145,10 @@ class OrcStripeReader : public RecordBatchReader { Status ReadNext(std::shared_ptr* out) override { std::unique_ptr batch; - ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_)); + std::unique_ptr builder; + + ORC_BEGIN_CATCH_NOT_OK + batch = row_reader_->createRowBatch(batch_size_); const liborc::Type& type = row_reader_->getSelectedType(); if (!row_reader_->next(*batch)) { @@ -153,10 +156,8 @@ class OrcStripeReader : public RecordBatchReader { return Status::OK(); } - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema_, pool_, batch->numElements)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -164,9 +165,9 @@ class OrcStripeReader : public RecordBatchReader { RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0, batch->numElements, builder->GetField(i))); } + ORC_END_CATCH_NOT_OK - ARROW_ASSIGN_OR_RAISE(*out, builder->Flush()); - return Status::OK(); + return builder->Flush().Value(out); } private: @@ -470,15 +471,13 @@ class ORCFileReader::Impl { int64_t nrows) { std::unique_ptr row_reader; std::unique_ptr batch; + std::unique_ptr builder; ORC_BEGIN_CATCH_NOT_OK row_reader = reader_->createRowReader(opts); batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch)); - ORC_END_CATCH_NOT_OK - std::unique_ptr builder; ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows)); - // The top-level type must be a struct to read into an arrow table const auto& struct_batch = checked_cast(*batch); @@ -489,6 +488,7 @@ class ORCFileReader::Impl { batch->numElements, builder->GetField(i))); } } + ORC_END_CATCH_NOT_OK return builder->Flush(); } diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 24ec5fb307e0d..34ba858d67a4d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -143,9 +143,9 @@ def test_example_using_json(filename, datadir): check_example_file(path, table, need_fix=True) -def test_unknown_timezone(datadir): +def test_timezone_database_absent(datadir): # Example file relies on the timezone "US/Pacific". It should gracefully - # fail, not crash, if the timezone is not found. + # fail, not crash, if the timezone database is not found. path = datadir / 'TestOrcFile.testDate1900.orc' code = f"""if 1: import os From 930929181394d6cc2fe31634df3d7de1a5e4d2ee Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Dec 2024 18:46:35 +0100 Subject: [PATCH 4/5] Add another test for absent timezone --- python/pyarrow/tests/test_orc.py | 36 ++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 34ba858d67a4d..1ea97a4c2d723 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -17,6 +17,8 @@ import decimal import datetime +from pathlib import Path +import shutil import subprocess import sys @@ -149,21 +151,43 @@ def test_timezone_database_absent(datadir): path = datadir / 'TestOrcFile.testDate1900.orc' code = f"""if 1: import os - import re os.environ['TZDIR'] = '/tmp/non_existent' from pyarrow import orc - orc_file = orc.ORCFile({str(path)!r}) try: - orc_file.read() + orc_file = orc.ORCFile({str(path)!r}) except Exception as e: - assert "time zone" in e.lower(), e + assert "time zone database" in str(e).lower(), e else: assert False, "Should have raised exception" + """ + subprocess.run([sys.executable, "-c", code], check=True) + + +def test_timezone_absent(datadir, tmpdir): + # Example file relies on the timezone "US/Pacific". It should gracefully + # fail, not crash, if the timezone database is present but the timezone + # is not found (GH-40633). + source_tzdir = Path('/usr/share/zoneinfo') + if not source_tzdir.exists(): + pytest.skip(f"Test needs timezone database in {source_tzdir}") + tzdir = Path(tmpdir / 'zoneinfo') + shutil.copytree(source_tzdir, tzdir, symlinks=True) + (tzdir / 'US' / 'Pacific').unlink(missing_ok=True) + + path = datadir / 'TestOrcFile.testDate1900.orc' + code = f"""if 1: + import os + import re + os.environ['TZDIR'] = {str(tzdir)!r} + + from pyarrow import orc + orc_file = orc.ORCFile({str(path)!r}) try: - orc_file.read_stripe(0) + orc_file.read() except Exception as e: - assert "time zone" in e.lower(), e + assert re.search( + "Can't open .*/zoneinfo/US/Pacific", str(e), flags=re.I), e else: assert False, "Should have raised exception" """ From 1251958104be1dbe0e61d6215878dff32a855e32 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Dec 2024 19:08:01 +0100 Subject: [PATCH 5/5] Relax test --- python/pyarrow/tests/test_orc.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 1ea97a4c2d723..b0f9e813b103d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -156,6 +156,7 @@ def test_timezone_database_absent(datadir): from pyarrow import orc try: orc_file = orc.ORCFile({str(path)!r}) + orc_file.read() except Exception as e: assert "time zone database" in str(e).lower(), e else: @@ -172,13 +173,15 @@ def test_timezone_absent(datadir, tmpdir): if not source_tzdir.exists(): pytest.skip(f"Test needs timezone database in {source_tzdir}") tzdir = Path(tmpdir / 'zoneinfo') - shutil.copytree(source_tzdir, tzdir, symlinks=True) + try: + shutil.copytree(source_tzdir, tzdir, symlinks=True) + except OSError as e: + pytest.skip(f"Failed to copy timezone database: {e}") (tzdir / 'US' / 'Pacific').unlink(missing_ok=True) path = datadir / 'TestOrcFile.testDate1900.orc' code = f"""if 1: import os - import re os.environ['TZDIR'] = {str(tzdir)!r} from pyarrow import orc @@ -186,8 +189,7 @@ def test_timezone_absent(datadir, tmpdir): try: orc_file.read() except Exception as e: - assert re.search( - "Can't open .*/zoneinfo/US/Pacific", str(e), flags=re.I), e + assert "zoneinfo/US/Pacific" in str(e), e else: assert False, "Should have raised exception" """