ERR (string dtype): harmonize setitem error message for python and py…

…arrow storage (#60219)
pandas-dev · Nov 7, 2024 · 692ea6f · 692ea6f
1 parent 0937c95
commit 692ea6f
Show file tree

Hide file tree

Showing 10 changed files with 35 additions and 29 deletions.
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1145,7 +1145,7 @@ def fillna(
         try:
             fill_value = self._box_pa(value, pa_type=self._pa_array.type)
         except pa.ArrowTypeError as err:
-            msg = f"Invalid value '{value!s}' for dtype {self.dtype}"
+            msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
             raise TypeError(msg) from err
 
         try:
@@ -2136,7 +2136,7 @@ def _maybe_convert_setitem_value(self, value):
         try:
             value = self._box_pa(value, self._pa_array.type)
         except pa.ArrowTypeError as err:
-            msg = f"Invalid value '{value!s}' for dtype {self.dtype}"
+            msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
             raise TypeError(msg) from err
         return value
 

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -286,7 +286,7 @@ def _validate_setitem_value(self, value):
 
         # Note: without the "str" here, the f-string rendering raises in
         #  py38 builds.
-        raise TypeError(f"Invalid value '{value!s}' for dtype {self.dtype}")
+        raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'")
 
     def __setitem__(self, key, value) -> None:
         key = check_array_indexer(self, key)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -652,7 +652,8 @@ def _validate_scalar(self, value):
             return self.dtype.na_value
         elif not isinstance(value, str):
             raise TypeError(
-                f"Cannot set non-string value '{value}' into a string array."
+                f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a "
+                f"string or missing value, got '{type(value).__name__}' instead."
             )
         return value
 
@@ -743,7 +744,9 @@ def __setitem__(self, key, value) -> None:
                 value = self.dtype.na_value
             elif not isinstance(value, str):
                 raise TypeError(
-                    f"Cannot set non-string value '{value}' into a StringArray."
+                    f"Invalid value '{value}' for dtype '{self.dtype}'. Value should "
+                    f"be a string or missing value, got '{type(value).__name__}' "
+                    "instead."
                 )
         else:
             if not is_array_like(value):
@@ -753,7 +756,10 @@ def __setitem__(self, key, value) -> None:
                 # compatible, compatibility with arrow backed strings
                 value = np.asarray(value)
             if len(value) and not lib.is_string_array(value, skipna=True):
-                raise TypeError("Must provide strings.")
+                raise TypeError(
+                    "Invalid value for dtype 'str'. Value should be a "
+                    "string or missing value (or array of those)."
+                )
 
             mask = isna(value)
             if mask.any():

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -223,7 +223,10 @@ def insert(self, loc: int, item) -> ArrowStringArray:
         if self.dtype.na_value is np.nan and item is np.nan:
             item = libmissing.NA
         if not isinstance(item, str) and item is not libmissing.NA:
-            raise TypeError("Scalar must be NA or str")
+            raise TypeError(
+                f"Invalid value '{item}' for dtype 'str'. Value should be a "
+                f"string or missing value, got '{type(item).__name__}' instead."
+            )
         return super().insert(loc, item)
 
     def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
@@ -255,13 +258,19 @@ def _maybe_convert_setitem_value(self, value):
             if isna(value):
                 value = None
             elif not isinstance(value, str):
-                raise TypeError("Scalar must be NA or str")
+                raise TypeError(
+                    f"Invalid value '{value}' for dtype 'str'. Value should be a "
+                    f"string or missing value, got '{type(value).__name__}' instead."
+                )
         else:
             value = np.array(value, dtype=object, copy=True)
             value[isna(value)] = None
             for v in value:
                 if not (v is None or isinstance(v, str)):
-                    raise TypeError("Must provide strings")
+                    raise TypeError(
+                        "Invalid value for dtype 'str'. Value should be a "
+                        "string or missing value (or array of those)."
+                    )
         return super()._maybe_convert_setitem_value(value)
 
     def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:

diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py
@@ -8,7 +8,7 @@
 
 class TestSetitemValidation:
     def _check_setitem_invalid(self, arr, invalid):
-        msg = f"Invalid value '{invalid!s}' for dtype {arr.dtype}"
+        msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'"
         msg = re.escape(msg)
         with pytest.raises(TypeError, match=msg):
             arr[0] = invalid

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -109,14 +109,11 @@ def test_none_to_nan(cls, dtype):
 def test_setitem_validates(cls, dtype):
     arr = cls._from_sequence(["a", "b"], dtype=dtype)
 
-    if dtype.storage == "python":
-        msg = "Cannot set non-string value '10' into a StringArray."
-    else:
-        msg = "Scalar must be NA or str"
+    msg = "Invalid value '10' for dtype 'str"
     with pytest.raises(TypeError, match=msg):
         arr[0] = 10
 
-    msg = "Must provide strings"
+    msg = "Invalid value for dtype 'str"
     with pytest.raises(TypeError, match=msg):
         arr[:] = np.array([1, 2])
 
@@ -508,10 +505,7 @@ def test_fillna_args(dtype):
     expected = pd.array(["a", "b"], dtype=dtype)
     tm.assert_extension_array_equal(res, expected)
 
-    if dtype.storage == "pyarrow":
-        msg = "Invalid value '1' for dtype str"
-    else:
-        msg = "Cannot set non-string value '1' into a StringArray."
+    msg = "Invalid value '1' for dtype 'str"
     with pytest.raises(TypeError, match=msg):
         arr.fillna(value=1)
 
@@ -727,10 +721,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
 
     # for other non-string we should also raise an error
     ser = pd.Series(["a", "b", "c"], dtype=dtype)
-    if dtype.storage == "python":
-        msg = "Cannot set non-string value"
-    else:
-        msg = "Scalar must be NA or str"
+    msg = "Invalid value '1' for dtype 'str"
     with pytest.raises(TypeError, match=msg):
         ser[mask] = 1
 

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -1274,7 +1274,7 @@ def test_setting_mismatched_na_into_nullable_fails(
                 r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype",
                 r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype",
                 "'values' contains non-numeric NA",
-                r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}",
+                r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'",
             ]
         )
         with pytest.raises(TypeError, match=msg):

diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
@@ -931,7 +931,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype):
 
     mask = np.array([True, True, False], ndmin=obj.ndim).T
 
-    msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}"
+    msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'"
 
     for null in tm.NP_NAT_OBJECTS + [pd.NaT]:
         # NaT is an NA value that we should *not* cast to pd.NA dtype
@@ -1030,7 +1030,7 @@ def test_where_int_overflow(replacement, using_infer_string):
     df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
     if using_infer_string and replacement not in (None, "snake"):
         with pytest.raises(
-            TypeError, match="Cannot set non-string value|Scalar must be NA or str"
+            TypeError, match=f"Invalid value '{replacement}' for dtype 'str'"
         ):
             df.where(pd.notnull(df), replacement)
         return

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -1230,7 +1230,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string
         # assigning with loc/iloc attempts to set the values inplace, which
         #  in this case is successful
         if using_infer_string:
-            with pytest.raises(TypeError, match="Must provide strings"):
+            with pytest.raises(TypeError, match="Invalid value"):
                 result.loc[result.index, "A"] = [float(x) for x in col_data]
         else:
             result.loc[result.index, "A"] = [float(x) for x in col_data]

diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -864,7 +864,7 @@ def test_index_where(self, obj, key, expected, raises, val, using_infer_string):
         mask[key] = True
 
         if using_infer_string and obj.dtype == object:
-            with pytest.raises(TypeError, match="Scalar must"):
+            with pytest.raises(TypeError, match="Invalid value"):
                 Index(obj).where(~mask, val)
         else:
             res = Index(obj).where(~mask, val)
@@ -877,7 +877,7 @@ def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string
         mask[key] = True
 
         if using_infer_string and obj.dtype == object:
-            with pytest.raises(TypeError, match="Scalar must"):
+            with pytest.raises(TypeError, match="Invalid value"):
                 Index(obj).putmask(mask, val)
         else:
             res = Index(obj).putmask(mask, val)