diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8893fe0ecd398..9fed42690e54b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -192,6 +192,7 @@ Other enhancements - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) +- Allow passing ``pat_dict`` argument to :meth:`pandas.Series.str.replace` (:issue:`51748`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..6e089d9c5a987 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1394,12 +1394,13 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): @forbid_nonstring_types(["bytes"]) def replace( self, - pat: str | re.Pattern, - repl: str | Callable, + pat: str | re.Pattern | None = None, + repl: str | Callable | None = None, n: int = -1, case: bool | None = None, flags: int = 0, regex: bool = False, + pat_dict: dict | None = None, ): r""" Replace each occurrence of pattern/regex in the Series/Index. @@ -1434,6 +1435,9 @@ def replace( - If False, treats the pattern as a literal string - Cannot be set to False if `pat` is a compiled regex or `repl` is a callable. + pat_dict : dict, default None + Pairs representing strings to be replaced and their + updated values. 
Returns ------- @@ -1456,6 +1460,15 @@ def replace( Examples -------- + When `pat_dict` is a dictionary, every key in `pat_dict` is replaced + with its corresponding value: + + >>> pd.Series(['A', 'B', np.nan]).str.replace(pat_dict={'A': 'a', 'B': 'b'}) + 0 a + 1 b + 2 NaN + dtype: object + When `pat` is a string and `regex` is True, the given `pat` is compiled as a regex. When `repl` is a string, it replaces matching regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are @@ -1518,8 +1531,13 @@ def replace( 2 NaN dtype: object """ + if pat is None and pat_dict is None: + raise ValueError( + "Cannot replace a string without specifying a string to be modified." + ) + # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): + if not (isinstance(repl, str) or callable(repl)) and pat_dict is None: raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) @@ -1539,10 +1557,21 @@ def replace( if case is None: case = True - result = self._data.array._str_replace( - pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) + if pat_dict: + res_output = self._data + for key, value in pat_dict.items(): + result = res_output.array._str_replace( + key, str(value), n=n, case=case, flags=flags, regex=regex + ) + res_output = self._wrap_result(result) + + else: + result = self._data.array._str_replace( + pat, repl, n=n, case=case, flags=flags, regex=regex + ) + res_output = self._wrap_result(result) + + return res_output @forbid_nonstring_types(["bytes"]) def repeat(self, repeats): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..5daf8ba925c3e 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -354,6 +354,29 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): # 
-------------------------------------------------------------------------------------- # str.replace # -------------------------------------------------------------------------------------- +def test_replace_dict_invalid(any_string_dtype): + # New replace behavior introduced in #51914 + msg = "Cannot replace a string without specifying a string to be modified." + series = Series(data=["A", "B_junk", "C_gunk"], name="my_messy_col") + + with pytest.raises(ValueError, match=msg): + series.str.replace() + + +def test_replace_dict(any_string_dtype): + # GH 51914 + series = Series(data=["A", "B_junk", "C_gunk"], name="my_messy_col") + new_series1 = series.str.replace(pat_dict={"_gunk": "_junk"}) + expected1 = Series(data=["A", "B_junk", "C_junk"], name="my_messy_col") + tm.assert_series_equal(new_series1, expected1) + + +def test_replace_multi_dict(any_string_dtype): + # GH 51914 + series = Series(data=["A", "B", "C"], name="my_messy_col") + new_series = series.str.replace(pat_dict={"A": "a", "B": "b"}) + expected = Series(data=["a", "b", "C"], name="my_messy_col") + tm.assert_series_equal(new_series, expected) def test_replace(any_string_dtype):