From bfdf223133541da7e0002543e36bf71ba59af481 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Tue, 11 Oct 2022 18:29:10 +0200 Subject: [PATCH] ENH: change get_dummies default dtype to bool (#48022) * ENH: Warn when dtype is not passed to get_dummies * Edit get_dummies' dtype warning * Add whatsnew entry for issue #45848 * Fix dtype warning test * Suppress warnings in docs * Edit whatsnew entry Co-authored-by: Marco Edward Gorelli * Fix find_stack_level in get_dummies dtype warning * Change the default dtype of get_dummies to bool * Revert dtype(bool) change * Move the changelog entry to v1.6.0.rst * Move whatsnew entry to 'Other API changes' Co-authored-by: Marco Edward Gorelli Co-authored-by: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/core/reshape/encoding.py | 60 +++++++------- pandas/tests/frame/indexing/test_getitem.py | 4 +- pandas/tests/reshape/test_get_dummies.py | 90 +++++++++++---------- 4 files changed, 79 insertions(+), 76 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index bdaea89776b7c..0cad6f3caaf91 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -118,6 +118,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`) - :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. +- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`) - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`) - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`) - diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 6670633fcc587..a39e3c1f10956 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -66,7 +66,7 @@ def get_dummies( drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. - dtype : dtype, default np.uint8 + dtype : dtype, default bool Data type for new columns. Only a single dtype is allowed. Returns @@ -89,50 +89,50 @@ def get_dummies( >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) - a b c - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 - 3 1 0 0 + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) - a b - 0 1 0 - 1 0 1 - 2 0 0 + a b + 0 True False + 1 False True + 2 False False >>> pd.get_dummies(s1, dummy_na=True) - a b NaN - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 + a b NaN + 0 True False False + 1 False True False + 2 False False True >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c - 0 1 1 0 0 1 0 - 1 2 0 1 1 0 0 - 2 3 1 0 0 0 1 + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True >>> pd.get_dummies(pd.Series(list('abcaa'))) - a b c - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 - 3 1 0 0 - 4 1 0 0 + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) - b c - 0 0 0 - 1 1 0 - 2 0 1 - 3 0 0 - 4 0 0 + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c @@ -236,7 +236,7 @@ def _get_dummies_1d( codes, levels = factorize_from_iterable(Series(data)) if dtype is None: - dtype = np.dtype(np.uint8) + dtype = np.dtype(bool) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" dtype = np.dtype(dtype) # type: ignore[arg-type] diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index f5c85bd98d8ad..0c1b206cc39bb 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self): # GH#16115 cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) - expected = DataFrame( - [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats - ) + expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats) dummies = get_dummies(cats) result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 6c9a60caaa2be..4345a357a0ba8 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -171,7 +171,7 @@ def test_get_dummies_unicode(self, sparse): s = [e, eacute, eacute] res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 + {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]} ) if sparse: exp = exp.apply(SparseArray, fill_value=0) @@ -182,15 +182,15 @@ def test_dataframe_dummies_all_obj(self, df, sparse): result = get_dummies(df, sparse=sparse) expected = DataFrame( {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, - dtype=np.uint8, + dtype=bool, ) if sparse: expected = DataFrame( { - "A_a": SparseArray([1, 0, 1], dtype="uint8"), - "A_b": SparseArray([0, 1, 0], dtype="uint8"), - "B_b": SparseArray([1, 1, 0], dtype="uint8"), - "B_c": SparseArray([0, 0, 1], dtype="uint8"), + "A_a": SparseArray([1, 0, 1], dtype="bool"), + "A_b": SparseArray([0, 1, 0], dtype="bool"), + "B_b": SparseArray([1, 1, 0], dtype="bool"), + "B_c": SparseArray([0, 0, 1], dtype="bool"), } ) @@ -208,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df): "B_b": [1, 1, 0], "B_c": [0, 0, 1], }, - dtype=np.uint8, + dtype=bool, ) tm.assert_frame_equal(result, expected) @@ -238,12 +238,11 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): expected = DataFrame( { "C": [1, 2, 3], - "from_A_a": [1, 0, 1], - "from_A_b": [0, 1, 0], - "from_B_b": [1, 1, 0], - "from_B_c": [0, 0, 1], + "from_A_a": [True, False, True], + "from_A_b": [False, True, False], + "from_B_b": [True, True, False], + "from_B_c": [False, False, True], }, - dtype=np.uint8, ) expected[["C"]] = df[["C"]] cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] @@ -258,9 +257,12 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): result = get_dummies(df, prefix="bad", sparse=sparse) bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] expected = DataFrame( - [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + [ + [1, True, False, True, False], + [2, False, True, True, False], + [3, True, False, False, True], + ], columns=["C"] + bad_columns, - dtype=np.uint8, ) expected = expected.astype({"C": np.int64}) if sparse: @@ -269,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): expected = pd.concat( [ Series([1, 2, 3], name="C"), - Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), - Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + Series([True, False, True], name="bad_a", dtype="Sparse[bool]"), + Series([False, True, False], name="bad_b", dtype="Sparse[bool]"), + Series([True, True, False], name="bad_b", dtype="Sparse[bool]"), + Series([False, False, True], name="bad_c", dtype="Sparse[bool]"), ], axis=1, ) @@ -290,11 +292,11 @@ def test_dataframe_dummies_subset(self, df, sparse): }, ) cols = expected.columns - expected[cols[1:]] = expected[cols[1:]].astype(np.uint8) + expected[cols[1:]] = expected[cols[1:]].astype(bool) expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): @@ -302,18 +304,17 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): expected = DataFrame( { "C": [1, 2, 3], - "A..a": [1, 0, 1], - "A..b": [0, 1, 0], - "B..b": [1, 1, 0], - "B..c": [0, 0, 1], + "A..a": [True, False, True], + "A..b": [False, True, False], + "B..b": [True, True, False], + "B..c": [False, False, True], }, - dtype=np.uint8, ) expected[["C"]] = df[["C"]] expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) @@ -356,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse): ) columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] - expected[columns] = expected[columns].astype(np.uint8) + expected[columns] = expected[columns].astype(bool) if sparse: - expected[columns] = expected[columns].astype(SparseDtype("uint8", 0)) + expected[columns] = expected[columns].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) @@ -422,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): [ ( {"data": DataFrame({"ä": ["a"]})}, - DataFrame({"ä_a": [1]}, dtype=np.uint8), + DataFrame({"ä_a": [True]}), ), ( {"data": DataFrame({"x": ["ä"]})}, - DataFrame({"x_ä": [1]}, dtype=np.uint8), + DataFrame({"x_ä": [True]}), ), ( {"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, - DataFrame({"ä_a": [1]}, dtype=np.uint8), + DataFrame({"ä_a": [True]}), ), ( {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, - DataFrame({"xäa": [1]}, dtype=np.uint8), + DataFrame({"xäa": [True]}), ), ], ) @@ -451,7 +452,7 @@ def test_get_dummies_basic_drop_first(self, sparse): s_series = Series(s_list) s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) + expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool) result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: @@ -487,14 +488,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first s_NA = ["a", "b", np.nan] res = get_dummies(s_NA, drop_first=True, sparse=sparse) - exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) + exp = DataFrame({"b": [0, 1, 0]}, dtype=bool) if sparse: exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) - exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex( + exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex( ["b", np.nan], axis=1 ) if sparse: @@ -510,7 +511,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): def test_dataframe_dummies_drop_first(self, df, sparse): df = df[["A", "B"]] result = get_dummies(df, drop_first=True, sparse=sparse) - expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) + expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool) if sparse: expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) @@ -522,7 +523,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} ) cols = ["A_b", "B_c", "cat_y"] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(bool) expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: @@ -544,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): } ) cols = ["A_b", "A_nan", "B_c", "B_nan"] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(bool) expected = expected.sort_index(axis=1) if sparse: for col in cols: @@ -559,13 +560,13 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): def test_get_dummies_int_int(self): data = Series([1, 2, 1]) result = get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool) tm.assert_frame_equal(result, expected) data = Series(Categorical(["a", "b", "a"])) result = get_dummies(data) expected = DataFrame( - [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8 + [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool ) tm.assert_frame_equal(result, expected) @@ -616,9 +617,12 @@ def test_get_dummies_duplicate_columns(self, df): result = get_dummies(df).sort_index(axis=1) expected = DataFrame( - [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + [ + [1, True, False, True, False], + [2, False, True, True, False], + [3, True, False, False, True], + ], columns=["A", "A_a", "A_b", "A_b", "A_c"], - dtype=np.uint8, ).sort_index(axis=1) expected = expected.astype({"A": np.int64}) @@ -628,7 +632,7 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = DataFrame({"A": [1, 2]}) result = get_dummies(df, columns=["A"], sparse=True) - dtype = SparseDtype("uint8", 0) + dtype = SparseDtype("bool", 0) expected = DataFrame( { "A_1": SparseArray([1, 0], dtype=dtype),