From f8bd9886e0eba6142e89db7af5a91deb50497b75 Mon Sep 17 00:00:00 2001 From: abonte <6319051+abonte@users.noreply.github.com> Date: Tue, 16 Apr 2024 19:12:19 +0200 Subject: [PATCH 1/5] DOC: replace deprecated frequency alias (#58256) replace deprecated alias --- pandas/core/arrays/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f4f076103d8c3b..8ada9d88e08bc0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1787,7 +1787,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ---------- freq : str or Offset The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See + frequency like 's' (second) not 'ME' (month end). See :ref:`frequency aliases <timeseries.offset_aliases>` for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' From 53bd1a83a987cad854f53db48a4d472dfeffdced Mon Sep 17 00:00:00 2001 From: Thomas H Date: Tue, 16 Apr 2024 13:16:02 -0400 Subject: [PATCH 2/5] BUG: DataFrame slice selection treated as hashable in Python 3.12 #57500 (#58043) * Reorder slice and hashable in __getitem__ * Add unit test * Fix test and formatting * Update whatsnew * Restore original flow ordering * Move whatsnew entry to 3.0.0 * Move whatsnew entry to Indexing * Update doc/source/whatsnew/v3.0.0.rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/frame.py | 4 +++- pandas/tests/frame/indexing/test_indexing.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 17328e6084cb48..0992142f563638 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -386,7 +386,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in :meth:`DataFrame.__getitem__` 
returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Missing diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b386efb5a867d..cd4812c3f78ae7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3855,8 +3855,10 @@ def __getitem__(self, key): key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) - if is_hashable(key) and not is_iterator(key): + if is_hashable(key) and not is_iterator(key) and not isinstance(key, slice): # is_iterator to exclude generator e.g. test_getitem_listlike + # As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500) + # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) # GH#45316 Return view if key is not duplicated diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 49e5c4aff5afe6..5a6fe07aa007b0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -524,6 +524,16 @@ def test_loc_setitem_boolean_mask_allfalse(self): result.loc[result.b.isna(), "a"] = result.a.copy() tm.assert_frame_equal(result, df) + def test_getitem_slice_empty(self): + df = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + result = df[:] + + expected = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + + tm.assert_frame_equal(result, expected) + # Ensure df[:] returns a view of df, not the same object + assert result is not df + def test_getitem_fancy_slice_integers_step(self): df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) From b1dbd3bc1744d148fcc67aa807917bf6825470d3 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Tue, 16 Apr 2024 20:17:01 +0300 Subject: [PATCH 3/5] GH: PDEP vote issue template (#58204) * Create pdep_vote.yaml * Unindent validations * PDEP voting issue template * Update pdeps path * Minor changes * Update label 
name * Better wording * Remove placeholder * ignore:: * Remove wrong files --------- Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/pdep_vote.yaml | 74 +++++++++++++++++++ .../pdeps/0001-purpose-and-guidelines.md | 4 +- 2 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/pdep_vote.yaml diff --git a/.github/ISSUE_TEMPLATE/pdep_vote.yaml b/.github/ISSUE_TEMPLATE/pdep_vote.yaml new file mode 100644 index 00000000000000..6dcbd76eb0f74b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/pdep_vote.yaml @@ -0,0 +1,74 @@ +name: PDEP Vote +description: Call for a vote on a PDEP +title: "VOTE: " +labels: [Vote] + +body: + - type: markdown + attributes: + value: > + As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html), the following issue template should be used when a + maintainer has opened a PDEP discussion and is ready to call for a vote. + - type: checkboxes + attributes: + label: Locked issue + options: + - label: > + I locked this voting issue so that only voting members are able to cast their votes or + comment on this issue. + required: true + - type: input + id: PDEP-name + attributes: + label: PDEP number and title + placeholder: > + PDEP-1: Purpose and guidelines + validations: + required: true + - type: input + id: PDEP-link + attributes: + label: Pull request with discussion + description: e.g. https://github.com/pandas-dev/pandas/pull/47444 + validations: + required: true + - type: input + id: PDEP-rendered-link + attributes: + label: Rendered PDEP for easy reading + description: e.g. 
https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041 + validations: + required: true + - type: input + id: PDEP-number-of-discussion-participants + attributes: + label: Discussion participants + description: > + You may find it useful to list or total the number of participating members in the + PDEP discussion PR. This would be the maximum possible disapprove votes. + placeholder: > + 14 voting members participated in the PR discussion thus far. + - type: input + id: PDEP-vote-end + attributes: + label: Voting will close in 15 days. + description: The voting period end date. ('Voting will close in 15 days.' will be automatically written) + - type: markdown + attributes: + value: --- + - type: textarea + id: Vote + attributes: + label: Vote + value: | + Cast your vote in a comment below. + * +1: approve. + * 0: abstain. + * Reason: A one sentence reason is required. + * -1: disapprove + * Reason: A one sentence reason is required. + A disapprove vote requires prior participation in the linked discussion PR. + + @pandas-dev/pandas-core + validations: + required: true diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index 49a3bc4c871cdd..bb15b8f997b110 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -79,8 +79,8 @@ Next is described the workflow that PDEPs can follow. #### Submitting a PDEP -Proposing a PDEP is done by creating a PR adding a new file to `web/pdeps/`. -The file is a markdown file, you can use `web/pdeps/0001.md` as a reference +Proposing a PDEP is done by creating a PR adding a new file to `web/pandas/pdeps/`. +The file is a markdown file, you can use `web/pandas/pdeps/0001-purpose-and-guidelines.md` as a reference for the expected format. The initial status of a PDEP will be `Status: Draft`. 
This will be changed to From 8131381c9eb6264d7abb6fe66ef8b892933af5c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Apr 2024 08:28:30 -1000 Subject: [PATCH 4/5] REF: Clean up some iterator usages (#58267) * Use better data structures * Use generator and set * Move sorted to exception block, use set instead of list * Another iterator, use iter * another set * Dont use iterator protocol --- pandas/_libs/tslibs/offsets.pyx | 12 +++++------- pandas/core/frame.py | 27 ++++++++++++++------------- pandas/core/generic.py | 2 +- pandas/core/internals/construction.py | 15 +++++++-------- pandas/core/tools/datetimes.py | 8 ++++---- 5 files changed, 31 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e36abdf0ad9713..107608ec9f6060 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -219,8 +219,7 @@ cdef _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64D(dt) for dt in holidays] - holidays = tuple(sorted(holidays)) + holidays = tuple(sorted(_to_dt64D(dt) for dt in holidays)) kwargs = {"weekmask": weekmask} if holidays: @@ -419,11 +418,10 @@ cdef class BaseOffset: if "holidays" in all_paras and not all_paras["holidays"]: all_paras.pop("holidays") - exclude = ["kwds", "name", "calendar"] - attrs = [(k, v) for k, v in all_paras.items() - if (k not in exclude) and (k[0] != "_")] - attrs = sorted(set(attrs)) - params = tuple([str(type(self))] + attrs) + exclude = {"kwds", "name", "calendar"} + attrs = {(k, v) for k, v in all_paras.items() + if (k not in exclude) and (k[0] != "_")} + params = tuple([str(type(self))] + sorted(attrs)) return params @property diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cd4812c3f78ae7..b65a00db7d7df8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2301,8 +2301,8 @@ 
def maybe_reorder( exclude.update(index) if any(exclude): - arr_exclude = [x for x in exclude if x in arr_columns] - to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arr_exclude = (x for x in exclude if x in arr_columns) + to_remove = {arr_columns.get_loc(col) for col in arr_exclude} arrays = [v for i, v in enumerate(arrays) if i not in to_remove] columns = columns.drop(exclude) @@ -3705,7 +3705,7 @@ def transpose( nv.validate_transpose(args, {}) # construct the args - dtypes = list(self.dtypes) + first_dtype = self.dtypes.iloc[0] if len(self.columns) else None if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. @@ -3723,11 +3723,11 @@ def transpose( elif ( self._is_homogeneous_type - and dtypes - and isinstance(dtypes[0], ExtensionDtype) + and first_dtype is not None + and isinstance(first_dtype, ExtensionDtype) ): new_values: list - if isinstance(dtypes[0], BaseMaskedDtype): + if isinstance(first_dtype, BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import ( transpose_homogeneous_masked_arrays, @@ -3736,7 +3736,7 @@ def transpose( new_values = transpose_homogeneous_masked_arrays( cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) ) - elif isinstance(dtypes[0], ArrowDtype): + elif isinstance(first_dtype, ArrowDtype): # We have arrow EAs with the same dtype. We can transpose faster. from pandas.core.arrays.arrow.array import ( ArrowExtensionArray, @@ -3748,10 +3748,11 @@ def transpose( ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. 
- dtyp = dtypes[0] - arr_typ = dtyp.construct_array_type() + arr_typ = first_dtype.construct_array_type() values = self.values - new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + new_values = [ + arr_typ._from_sequence(row, dtype=first_dtype) for row in values + ] result = type(self)._from_arrays( new_values, @@ -5882,7 +5883,7 @@ def set_index( else: arrays.append(self.index) - to_remove: list[Hashable] = [] + to_remove: set[Hashable] = set() for col in keys: if isinstance(col, MultiIndex): arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) @@ -5909,7 +5910,7 @@ def set_index( arrays.append(frame[col]) names.append(col) if drop: - to_remove.append(col) + to_remove.add(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since @@ -5926,7 +5927,7 @@ def set_index( raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): + for c in to_remove: del frame[c] # clear up memory usage diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 523ca9de201bf3..9686c081b5fb38 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2045,7 +2045,7 @@ def __setstate__(self, state) -> None: # e.g. 
say fill_value needing _mgr to be # defined meta = set(self._internal_names + self._metadata) - for k in list(meta): + for k in meta: if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 73b93110c9018d..cea52bf8c91b27 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -567,7 +567,7 @@ def _extract_index(data) -> Index: if len(data) == 0: return default_index(0) - raw_lengths = [] + raw_lengths = set() indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False @@ -583,7 +583,7 @@ def _extract_index(data) -> Index: indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True - raw_lengths.append(len(val)) + raw_lengths.add(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError("Per-column arrays must each be 1-dimensional") @@ -596,24 +596,23 @@ def _extract_index(data) -> Index: index = union_indexes(indexes, sort=False) if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: + if len(raw_lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." 
) - + raw_length = raw_lengths.pop() if have_series: - if lengths[0] != len(index): + if raw_length != len(index): msg = ( - f"array length {lengths[0]} does not match index " + f"array length {raw_length} does not match index " f"length {len(index)}" ) raise ValueError(msg) else: - index = default_index(lengths[0]) + index = default_index(raw_length) return ensure_index(index) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2aeb1aff07a54f..df7a6cdb1ea52d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1124,18 +1124,18 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(set(required) - set(unit_rev.keys())) + req = set(required) - set(unit_rev.keys()) if len(req): - _required = ",".join(req) + _required = ",".join(sorted(req)) raise ValueError( "to assemble mappings requires at least that " f"[year, month, day] be specified: [{_required}] is missing" ) # keys we don't recognize - excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + excess = set(unit_rev.keys()) - set(_unit_map.values()) if len(excess): - _excess = ",".join(excess) + _excess = ",".join(sorted(excess)) raise ValueError( f"extra keys have been passed to the datetime assemblage: [{_excess}]" ) From bb0fcc23eed9f6a1a6506c6e27b98fb397ce747e Mon Sep 17 00:00:00 2001 From: Antonio Valentino Date: Tue, 16 Apr 2024 20:49:46 +0200 Subject: [PATCH 5/5] Avoid unnecessary re-opening of HDF5 files (Closes: #58248) (#58275) * Avoid unnecessary re-opening of HDF5 files * Update the whatsnew file * Move the changelog entry for #58248 to the correct section --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/pytables.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0992142f563638..7a4f709e56104f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -331,6 +331,7 @@ 
Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`to_hdf` by avoiding unnecessary reopenings of the HDF5 file, speeding up data addition to files with a very large number of groups. (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) @@ -406,7 +407,6 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Period ^^^^^^ - diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5ecf7e287ea58b..3cfd740a513041 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -292,14 +292,14 @@ def to_hdf( dropna=dropna, ) - path_or_buf = stringify_path(path_or_buf) - if isinstance(path_or_buf, str): + if isinstance(path_or_buf, HDFStore): + f(path_or_buf) + else: + path_or_buf = stringify_path(path_or_buf) with HDFStore( path_or_buf, mode=mode, complevel=complevel, complib=complib ) as store: f(store) - else: - f(path_or_buf) def read_hdf(