From 07e0bca0a6e2005b6fc31110f28c32e606df288d Mon Sep 17 00:00:00 2001 From: easternsun7 <165460574+easternsun7@users.noreply.github.com> Date: Tue, 10 Dec 2024 02:31:40 +0800 Subject: [PATCH 01/13] Update frame.rst (#60525) Fix the navigation bar --- doc/source/reference/frame.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 7680c8b434866..e701d48a89db7 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -185,7 +185,6 @@ Reindexing / selection / label manipulation DataFrame.duplicated DataFrame.equals DataFrame.filter - DataFrame.head DataFrame.idxmax DataFrame.idxmin DataFrame.reindex @@ -196,7 +195,6 @@ Reindexing / selection / label manipulation DataFrame.sample DataFrame.set_axis DataFrame.set_index - DataFrame.tail DataFrame.take DataFrame.truncate From 59f947ff40308bcfb6ecb65eb23b391d6f031c03 Mon Sep 17 00:00:00 2001 From: Michelino Gali <107483586+migelogali@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:32:30 -0500 Subject: [PATCH 02/13] updated v to conv_val in that function (#60518) --- pandas/core/computation/pytables.py | 56 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index fe7e27f537b01..4a75acce46632 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -205,7 +205,7 @@ def generate(self, v) -> str: val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" - def convert_value(self, v) -> TermValue: + def convert_value(self, conv_val) -> TermValue: """ convert the expression that is in the term to something that is accepted by pytables @@ -219,44 +219,44 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) if kind == "datetime" or (kind and kind.startswith("datetime64")): - if isinstance(v, (int, float)): - v = stringify(v) - v = ensure_decoded(v) - v = Timestamp(v).as_unit("ns") - if v.tz is not None: - v = v.tz_convert("UTC") - return TermValue(v, v._value, kind) + if isinstance(conv_val, (int, float)): + conv_val = stringify(conv_val) + conv_val = ensure_decoded(conv_val) + conv_val = Timestamp(conv_val).as_unit("ns") + if conv_val.tz is not None: + conv_val = conv_val.tz_convert("UTC") + return TermValue(conv_val, conv_val._value, kind) elif kind in ("timedelta64", "timedelta"): - if isinstance(v, str): - v = Timedelta(v) + if isinstance(conv_val, str): + conv_val = Timedelta(conv_val) else: - v = Timedelta(v, unit="s") - v = v.as_unit("ns")._value - return TermValue(int(v), v, kind) + conv_val = Timedelta(conv_val, unit="s") + conv_val = conv_val.as_unit("ns")._value + return TermValue(int(conv_val), conv_val, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) result: npt.NDArray[np.intp] | np.intp | int - if v not in metadata: + if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + result = metadata.searchsorted(conv_val, side="left") return TermValue(result, result, "integer") elif kind == "integer": try: - v_dec = Decimal(v) + v_dec = Decimal(conv_val) except InvalidOperation: # GH 54186 # convert v to float to raise float's ValueError - float(v) + float(conv_val) else: - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) - return TermValue(v, v, kind) + conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(conv_val, conv_val, kind) elif kind == "float": - v = float(v) - return TermValue(v, v, kind) + conv_val = float(conv_val) + return TermValue(conv_val, conv_val, kind) elif kind == "bool": - if isinstance(v, str): - v = v.strip().lower() not in [ + if isinstance(conv_val, str): + conv_val = conv_val.strip().lower() not in [ "false", "f", "no", @@ -268,13 +268,13 @@ def stringify(value): "", ] else: - v = bool(v) - return TermValue(v, v, kind) - elif isinstance(v, str): + conv_val = bool(conv_val) + return TermValue(conv_val, conv_val, kind) + elif isinstance(conv_val, str): # string quoting - return TermValue(v, stringify(v), "string") + return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column") def convert_values(self) -> None: pass From 05f7ef9a2128ca04939f30840e86b38ec490c617 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:35:22 +0100 Subject: [PATCH 03/13] BUG: Fix `ListAccessor` methods to preserve original name (#60527) * fix: preserve series name in ListAccessor * formatting * add whatsnew v3.0.0 entry --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/accessors.py | 24 +++++++++++++++---- .../series/accessors/test_list_accessor.py | 18 +++++++++++--- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab5746eca1b18..b799b7ea5cb39 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -798,6 +798,7 @@ Other - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) +- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 230522846d377..b220a94d032b5 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -117,7 +117,10 @@ def len(self) -> Series: value_lengths = pc.list_value_length(self._pa_array) return Series( - value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + value_lengths, + dtype=ArrowDtype(value_lengths.type), + index=self._data.index, + name=self._data.name, ) def __getitem__(self, key: int | slice) -> Series: @@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) return Series( - element, dtype=ArrowDtype(element.type), index=self._data.index + element, + dtype=ArrowDtype(element.type), + index=self._data.index, + name=self._data.name, ) elif isinstance(key, slice): if pa_version_under11p0: @@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) + return Series( + sliced, + dtype=ArrowDtype(sliced.type), + index=self._data.index, + name=self._data.name, + ) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -223,7 +234,12 @@ def flatten(self) -> Series: counts = pa.compute.list_value_length(self._pa_array) flattened = pa.compute.list_flatten(self._pa_array) index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) - return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) + return Series( + flattened, + dtype=ArrowDtype(flattened.type), + index=index, + name=self._data.name, + ) class StructAccessor(ArrowAccessor): diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index c153e800cb534..bec8ca13a2f5f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -25,9 +25,10 @@ def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]") + expected = Series([2, None, None], dtype="int64[pyarrow]", name="a") tm.assert_series_equal(actual, expected) @@ -37,9 +38,15 @@ def test_list_getitem_index(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + expected = Series( + [2, None, None], + dtype="int64[pyarrow]", + index=[1, 3, 7], + name="a", + ) tm.assert_series_equal(actual, expected) @@ -48,6 +55,7 @@ def test_list_getitem_slice(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) if pa_version_under11p0: with pytest.raises( @@ -60,6 +68,7 @@ def test_list_getitem_slice(): [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) tm.assert_series_equal(actual, expected) @@ -68,9 +77,10 @@ def test_list_len(): ser = Series( [[1, 2, 3], [4, None], None], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.len() - expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a") tm.assert_series_equal(actual, expected) @@ -78,12 +88,14 @@ def test_list_flatten(): ser = Series( [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.flatten() expected = Series( [1, 2, 3, 4, None, 7, 8], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2, 4, 4], + name="a", ) tm.assert_series_equal(actual, expected) From e6e1987b988857bb511d3797400b4d1873e86760 Mon Sep 17 00:00:00 2001 From: Wong2333 <3201884732@qq.com> Date: Tue, 10 Dec 2024 02:37:04 +0800 Subject: [PATCH 04/13] DOC: Update variables a and b to names consistent with comment documentation (#60526) * DOC: Fix title capitalization in documentation file * DOC: Fix title capitalization in documentation files * Update variables a and b to names consistent with comment documentation --- pandas/core/computation/expressions.py | 70 +++++++++++++------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index e2acd9a2c97c2..a2c3a706ae29c 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None: ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b): +def _evaluate_standard(op, op_str, left_op, right_op): """ Standard evaluation. """ if _TEST_MODE: _store_test_result(False) - return op(a, b) + return op(left_op, right_op) -def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: - """return a boolean if we WILL be using numexpr""" +def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool: + """return left_op boolean if we WILL be using numexpr""" if op_str is not None: # required min elements (otherwise we are adding overhead) - if a.size > _MIN_ELEMENTS: + if left_op.size > _MIN_ELEMENTS: # check for dtype compatibility dtypes: set[str] = set() - for o in [a, b]: + for o in [left_op, right_op]: # ndarray and Series Case if hasattr(o, "dtype"): dtypes |= {o.dtype.name} @@ -93,22 +93,22 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: return False -def _evaluate_numexpr(op, op_str, a, b): +def _evaluate_numexpr(op, op_str, left_op, right_op): result = None - if _can_use_numexpr(op, op_str, a, b, "evaluate"): + if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"): is_reversed = op.__name__.strip("_").startswith("r") if is_reversed: # we were originally called by a reversed op method - a, b = b, a + left_op, right_op = right_op, left_op - a_value = a - b_value = b + left_value = left_op + right_value = right_op try: result = ne.evaluate( - f"a_value {op_str} b_value", - local_dict={"a_value": a_value, "b_value": b_value}, + f"left_value {op_str} right_value", + local_dict={"left_value": left_value, "right_value": right_op}, casting="safe", ) except TypeError: @@ -116,20 +116,20 @@ def _evaluate_numexpr(op, op_str, a, b): # (https://github.com/pydata/numexpr/issues/379) pass except NotImplementedError: - if _bool_arith_fallback(op_str, a, b): + if _bool_arith_fallback(op_str, left_op, right_op): pass else: raise if is_reversed: # reverse order to original for fallback - a, b = b, a + left_op, right_op = right_op, left_op if _TEST_MODE: _store_test_result(result is not None) if result is None: - result = _evaluate_standard(op, op_str, a, b) + result = _evaluate_standard(op, op_str, left_op, right_op) return result @@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b): } -def _where_standard(cond, a, b): +def _where_standard(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary - return np.where(cond, a, b) + return np.where(cond, left_op, right_op) -def _where_numexpr(cond, a, b): +def _where_numexpr(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary result = None - if _can_use_numexpr(None, "where", a, b, "where"): + if _can_use_numexpr(None, "where", left_op, right_op, "where"): result = ne.evaluate( "where(cond_value, a_value, b_value)", - local_dict={"cond_value": cond, "a_value": a, "b_value": b}, + local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op}, casting="safe", ) if result is None: - result = _where_standard(cond, a, b) + result = _where_standard(cond, left_op, right_op) return result @@ -206,13 +206,13 @@ def _has_bool_dtype(x): _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} -def _bool_arith_fallback(op_str, a, b) -> bool: +def _bool_arith_fallback(op_str, left_op, right_op) -> bool: """ Check if we should fallback to the python `_evaluate_standard` in case of an unsupported operation by numexpr, which is the case for some boolean ops. """ - if _has_bool_dtype(a) and _has_bool_dtype(b): + if _has_bool_dtype(left_op) and _has_bool_dtype(right_op): if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( f"evaluating in Python space because the {op_str!r} " @@ -224,15 +224,15 @@ def _bool_arith_fallback(op_str, a, b) -> bool: return False -def evaluate(op, a, b, use_numexpr: bool = True): +def evaluate(op, left_op, right_op, use_numexpr: bool = True): """ - Evaluate and return the expression of the op on a and b. + Evaluate and return the expression of the op on left_op and right_op. Parameters ---------- op : the actual operand - a : left operand - b : right operand + left_op : left operand + right_op : right operand use_numexpr : bool, default True Whether to try to use numexpr. """ @@ -240,24 +240,24 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: if use_numexpr: # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] - return _evaluate_standard(op, op_str, a, b) + return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc] + return _evaluate_standard(op, op_str, left_op, right_op) -def where(cond, a, b, use_numexpr: bool = True): +def where(cond, left_op, right_op, use_numexpr: bool = True): """ - Evaluate the where condition cond on a and b. + Evaluate the where condition cond on left_op and right_op. Parameters ---------- cond : np.ndarray[bool] - a : return if cond is True - b : return if cond is False + left_op : return if cond is True + right_op : return if cond is False use_numexpr : bool, default True Whether to try to use numexpr. """ assert _where is not None - return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: From 2d774e7f3e54ff94b03c7500c5ec756b16e47d10 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Tue, 10 Dec 2024 02:37:57 +0800 Subject: [PATCH 05/13] DOC: fix broken link in Resampler.bfill (#60524) --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fdfb9f21bdb9f..0d1541bbb3afa 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -694,7 +694,7 @@ def bfill(self, limit: int | None = None): References ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29 Examples -------- From f3b798545160fc878e87d05947e0180df031ecb6 Mon Sep 17 00:00:00 2001 From: sunlight <138234530+sunlight798@users.noreply.github.com> Date: Tue, 10 Dec 2024 02:38:39 +0800 Subject: [PATCH 06/13] DOC: Fix docstrings for errors (#60523) * DOC: Fix docstrings for errors * DOC: Fix docstrings for errors --- ci/code_checks.sh | 3 --- pandas/errors/__init__.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index adc5bc9a01bdd..7bc220acdd74c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -95,9 +95,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 1de6f06ef316c..cd31ec30522c3 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -45,6 +45,11 @@ class NullFrequencyError(ValueError): Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, ``PeriodIndex.shift``. + See Also + -------- + Index.shift : Shift values of Index. + Series.shift : Shift values of Series. + Examples -------- >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) @@ -58,6 +63,12 @@ class PerformanceWarning(Warning): """ Warning raised when there is a possible performance impact. + See Also + -------- + DataFrame.set_index : Set the DataFrame index using existing columns. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- >>> df = pd.DataFrame( @@ -385,6 +396,13 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame( From b667fdf8dd4e1ea8bf2e001fbfe23beeb4735a51 Mon Sep 17 00:00:00 2001 From: Aditya Ghosh <72292940+Nanashi-bot@users.noreply.github.com> Date: Tue, 10 Dec 2024 00:10:54 +0530 Subject: [PATCH 07/13] Add extended summary for fullmatch, match, pad, repeat, slice and slice_replace (#60520) Add extended summary for fullmatch, match, pad, repeat, slice and slice_replace functions --- pandas/core/strings/accessor.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 05e1a36877e06..c68b6303661b9 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1374,6 +1374,11 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. + Determines whether each string in the Series or Index starts with a + match to a specified regular expression. This function is especially + useful for validating prefixes, such as ensuring that codes, tags, or + identifiers begin with a specific pattern. + Parameters ---------- pat : str @@ -1419,6 +1424,11 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. + Checks if each string in the Series or Index fully matches the + specified regular expression pattern. This function is useful when the + requirement is for an entire string to conform to a pattern, such as + validating formats like phone numbers or email addresses. + Parameters ---------- pat : str @@ -1647,6 +1657,10 @@ def repeat(self, repeats): """ Duplicate each string in the Series or Index. + Duplicates each string in the Series or Index, either by applying the + same repeat count to all elements or by using different repeat values + for each element. + Parameters ---------- repeats : int or sequence of int @@ -1710,6 +1724,12 @@ def pad( """ Pad strings in the Series/Index up to width. + This function pads strings in a Series or Index to a specified width, + filling the extra space with a character of your choice. It provides + flexibility in positioning the padding, allowing it to be added to the + left, right, or both sides. This is useful for formatting strings to + align text or ensure consistent string lengths in data processing. + Parameters ---------- width : int @@ -1920,6 +1940,11 @@ def slice(self, start=None, stop=None, step=None): """ Slice substrings from each element in the Series or Index. + Slicing substrings from strings in a Series or Index helps extract + specific portions of data, making it easier to analyze or manipulate + text. This is useful for tasks like parsing structured text fields or + isolating parts of strings with a consistent format. + Parameters ---------- start : int, optional @@ -1996,6 +2021,11 @@ def slice_replace(self, start=None, stop=None, repl=None): """ Replace a positional slice of a string with another value. + This function allows replacing specific parts of a string in a Series + or Index by specifying start and stop positions. It is useful for + modifying substrings in a controlled way, such as updating sections of + text based on their positions or patterns. + Parameters ---------- start : int, optional From 6cbe941c4512b86156eb06a26d253f4aa30b0304 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 9 Dec 2024 10:46:14 -0800 Subject: [PATCH 08/13] BUG: Fix float32 precision issues in pd.to_datetime (#60510) * BUG: Fix float32 precision issues in pd.to_datetime * BUG: Add note to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/tools/datetimes.py | 5 +++++ pandas/tests/tools/test_to_datetime.py | 12 ++++++++++++ 3 files changed, 18 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b799b7ea5cb39..2013f81d4da18 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -626,6 +626,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) +- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`) - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4680a63bf57a1..30487de7bafd5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -44,6 +44,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_float, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -1153,6 +1154,10 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) + # prevent prevision issues in case of float32 # GH#60506 + if is_float_dtype(values.dtype): + values = values.astype("float64") + # prevent overflow in case of int8 or int16 if is_integer_dtype(values.dtype): values = values.astype("int64") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b73839f406a29..74b051aec71a4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2084,6 +2084,18 @@ def test_dataframe_str_dtype(self, df, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float32_dtype(self, df, cache): + # GH#60506 + # coerce to float64 + result = to_datetime(df.astype(np.float32), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) From ca91dd4c39a02c0026b98c16c56996f81506e004 Mon Sep 17 00:00:00 2001 From: jmalp <75514361+jmalp@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:54:40 -0800 Subject: [PATCH 09/13] DOC: fix docstrings validation for pandas.core.groupby.DataFrameGroupBy.boxplot (#60509) * fix docstrings validation for pandas.core.groupby.DataFrameGroupBy.boxplot * fix trailing whitespace * fix the error "pandas.Series.plot in `See Also` section does not need `pandas` prefix, use Series.plot instead." * fix the error "pandas.DataFrame.boxplot in `See Also` section does not need `pandas` prefix, use DataFrame.boxplot instead." --- ci/code_checks.sh | 1 - pandas/plotting/_core.py | 26 +++++++++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7bc220acdd74c..fdaffb5a9c9ef 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.IntervalArray.length SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index fbf9009cedc40..aee872f9ae50a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -570,18 +570,23 @@ def boxplot_frame_groupby( Parameters ---------- - grouped : Grouped DataFrame + grouped : DataFrameGroupBy + The grouped DataFrame object over which to create the box plots. subplots : bool * ``False`` - no subplots will be used * ``True`` - create a subplot for each group. - column : column name or list of names, or vector Can be any valid input to groupby. fontsize : float or str - rot : label rotation angle - grid : Setting this to True will show the grid + Font size for the labels. + rot : float + Rotation angle of labels (in degrees) on the x-axis. + grid : bool + Whether to show grid lines on the plot. ax : Matplotlib axis object, default None - figsize : A tuple (width, height) in inches + The axes on which to draw the plots. If None, uses the current axes. + figsize : tuple of (float, float) + The figure size in inches (width, height). layout : tuple (optional) The layout of the plot: (rows, columns). sharex : bool, default False @@ -599,8 +604,15 @@ def boxplot_frame_groupby( Returns ------- - dict of key/value = group key/DataFrame.boxplot return value - or DataFrame.boxplot return value in case subplots=figures=False + dict or DataFrame.boxplot return value + If ``subplots=True``, returns a dictionary of group keys to the boxplot + return values. If ``subplots=False``, returns the boxplot return value + of a single DataFrame. + + See Also + -------- + DataFrame.boxplot : Create a box plot from a DataFrame. + Series.plot : Plot a Series. Examples -------- From 719fc0fcbcda23a79156ccfc990228df0851452f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:15:34 -0800 Subject: [PATCH 10/13] FIX: ruff checks in expressions/pytables (#60541) * FIX: ruff checks in expressions/pytables * swap condition * more pre-commit --- pandas/core/computation/expressions.py | 7 +++++-- pandas/core/computation/pytables.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a2c3a706ae29c..5a5fad0d83d7a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op): try: result = ne.evaluate( f"left_value {op_str} right_value", - local_dict={"left_value": left_value, "right_value": right_op}, + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: @@ -257,7 +257,10 @@ def where(cond, left_op, right_op, use_numexpr: bool = True): Whether to try to use numexpr. """ assert _where is not None - return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op) + if use_numexpr: + return _where(cond, left_op, right_op) + else: + return _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4a75acce46632..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -274,7 +274,9 @@ def stringify(value): # string quoting return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass From 38224dd910e57fef7a3b0f4e85d67d8e690d6897 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:20:10 -0800 Subject: [PATCH 11/13] CI/TST: Use tm.external_error_raised for test_from_arrow_respecting_given_dtype_unsafe (#60544) --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c6ac6368f2770..6dd1f3f15bc15 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1647,7 +1647,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) From 13e2df0d7074cbc1a8d59d7044d5bfcb69147a3d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:11:30 -0800 Subject: [PATCH 12/13] CI: Ignore prompting in test-arm when apt-get installing (#60546) * CI: Ignore prompting in test-arm when apt-get installing * CI: Ignore prompting in test-arm when apt-get installing * Skip the apt-get install all together --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c986e5b1b054..139ea9d220453 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -34,7 +34,6 @@ jobs: fi python -m pip install --no-build-isolation -ve . -Csetup-args="--werror" PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ci/run_tests.sh test-linux-musl: docker: From c52846ff94d51ce5940928c199da00f403bc8138 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:00:31 -0800 Subject: [PATCH 13/13] TST: filter possible RuntimeWarning in tests (#60553) * Ignore possible RuntimeWarning in _hash_ndarray * Revert "Ignore possible RuntimeWarning in _hash_ndarray" This reverts commit 1c9a763a0a6e7b6ba4dcfd364a3fcb506883ba16. * Just filter warnings instead * Fix typos --- pandas/tests/extension/test_interval.py | 25 +++++++++++++++++++ pandas/tests/frame/methods/test_to_numpy.py | 4 +++ pandas/tests/frame/test_constructors.py | 3 +++ pandas/tests/indexes/interval/test_astype.py | 6 +++++ pandas/tests/indexes/interval/test_formats.py | 3 +++ .../tests/indexes/interval/test_indexing.py | 3 +++ pandas/tests/indexes/test_setops.py | 1 + pandas/tests/io/excel/test_writers.py | 3 +++ pandas/tests/reshape/test_cut.py | 1 + 9 files changed, 49 insertions(+) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index ec979ac6d22dc..011bf0b2016b2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -101,6 +101,31 @@ def test_fillna_limit_series(self, data_missing): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index d38bc06260a0e..36088cceb13f1 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -31,6 +32,9 @@ def test_to_numpy_copy(self): # and that can be respected because we are already numpy-float assert df.to_numpy(copy=False).base is df.values.base + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d8213cb3d11a..9b6080603f0c9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2404,6 +2404,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 787461b944bd0..5783a16e81d37 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -340,6 +340,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 5f934ca3e6e83..58b69d79c65ce 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -525,6 +525,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 18948de72200a..ced4feb9e7eb9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -800,6 +800,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): # see gh-19242 # diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index d8bb4fba1e1fe..63332fe4658e5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -733,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424