Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix parsing of ODF time values with comments #55324

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ Bug fixes
- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
- Bug in :meth:`pandas.read_excel` where ODS files with comments on time value cells failed to parse (:issue:`55200`)
- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)

Categorical
Expand Down
37 changes: 35 additions & 2 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import datetime
import re
from typing import (
TYPE_CHECKING,
cast,
Expand All @@ -26,6 +28,12 @@

from pandas._libs.tslibs.nattype import NaTType

# ODF variant of the ISO 8601 time/duration format: "PThhhHmmMss.sssS"
# (hours, minutes, seconds and an optional fractional-second part).
# See https://www.w3.org/TR/xmlschema-2/#duration for details.
#
# Capture groups:
#   1 - hours (one or more digits, so values above 23 are matched as well)
#   2 - minutes
#   3 - whole seconds
#   4 - optional fractional part, including the leading "."
#   5 - digits of the fractional part only (None when there is no fraction)
# Whitespace is tolerated before "PT", around the "H" and "M" designators
# and before the final "S"; nothing may follow the "S".
ODF_ISOTIME_PATTERN = re.compile(
r"^\s*PT\s*(\d+)\s*H\s*(\d+)\s*M\s*(\d+)(\.(\d+))?\s*S$"
)


@doc(storage_options=_shared_docs["storage_options"])
class ODFReader(BaseExcelReader["OpenDocument"]):
Expand Down Expand Up @@ -214,9 +222,9 @@ def _get_cell_value(self, cell) -> Scalar | NaTType:
cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.Timestamp(cell_value)
elif cell_type == "time":
stamp = pd.Timestamp(str(cell))
stamp = self._get_cell_time_value(cell)
# cast needed here because Scalar doesn't include datetime.time
return cast(Scalar, stamp.time())
return cast(Scalar, stamp)
else:
self.close()
raise ValueError(f"Unrecognized type {cell_type}")
Expand Down Expand Up @@ -247,3 +255,28 @@ def _get_cell_string_value(self, cell) -> str:
else:
value.append(str(fragment).strip("\n"))
return "".join(value)

def _get_cell_time_value(self, cell) -> datetime.time:
    """
    Parse the ``office:time-value`` attribute of an ODF cell.

    The attribute is expected to match ``ODF_ISOTIME_PATTERN``
    ("PThhhHmmMss.sssS"). Only the time of day is returned: hour values
    of 24 or more are wrapped modulo 24.

    Parameters
    ----------
    cell
        ODF table cell whose ``attributes`` mapping is looked up with the
        ``(OFFICENS, "time-value")`` key.

    Returns
    -------
    datetime.time
        Time of day with microsecond resolution; fractional digits beyond
        the sixth are truncated.

    Raises
    ------
    ValueError
        If the attribute is missing or does not match the expected
        format, or if ``datetime.time`` rejects the parsed minute or
        second value as out of range.
    """
    from odf.namespaces import OFFICENS

    value = cell.attributes.get((OFFICENS, "time-value"))
    # A missing attribute yields None; report it through the same
    # ValueError as a malformed value instead of letting re raise TypeError.
    parts = None if value is None else ODF_ISOTIME_PATTERN.match(value)
    if parts is None:
        raise ValueError(f"Failed to parse ODF time value: {value}")

    # Group 4 is the fraction including its leading dot; only the bare
    # digits in group 5 are needed.
    hours, minutes, seconds, fraction = parts.group(1, 2, 3, 5)

    if fraction is None:
        microseconds = 0
    else:
        # Scale to exactly six digits with string operations. This keeps
        # the arithmetic in integers, so long fractions are truncated
        # rather than rounded up to an invalid 1_000_000 by float math.
        microseconds = int(fraction[:6].ljust(6, "0"))

    return datetime.time(
        # Some representations carry whole days in the hour count, while
        # datetime.time only accepts hours in 0..23, so drop the day part.
        hour=int(hours) % 24,
        minute=int(minutes),
        second=int(seconds),
        microsecond=microseconds,
    )
Binary file not shown.
Binary file modified pandas/tests/io/data/excel/times_1900.ods
Binary file not shown.
Binary file modified pandas/tests/io/data/excel/times_1904.ods
Binary file not shown.
5 changes: 5 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,11 @@ def test_reader_seconds(self, request, engine, read_ext):
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
tm.assert_frame_equal(actual, expected)

if read_ext == ".ods":
msg = "Failed to parse ODF time value: PT01H5a2M00S"
with pytest.raises(ValueError, match=msg):
pd.read_excel("test_corrupted_time" + read_ext)

def test_read_excel_multiindex(self, request, engine, read_ext):
# see gh-4679
if engine == "pyxlsb":
Expand Down