Skip to content

Commit

Permalink
ENH: change get_dummies default dtype to bool (#48022)
Browse files Browse the repository at this point in the history
* ENH: Warn when dtype is not passed to get_dummies

* Edit get_dummies' dtype warning

* Add whatsnew entry for issue #45848

* Fix dtype warning test

* Suppress warnings in docs

* Edit whatsnew entry

Co-authored-by: Marco Edward Gorelli <[email protected]>

* Fix find_stack_level in get_dummies dtype warning

* Change the default dtype of get_dummies to bool

* Revert dtype(bool) change

* Move the changelog entry to v1.6.0.rst

* Move whatsnew entry to 'Other API changes'

Co-authored-by: Marco Edward Gorelli <[email protected]>
Co-authored-by: Marco Edward Gorelli <[email protected]>
  • Loading branch information
3 people authored Oct 11, 2022
1 parent b48a73f commit bfdf223
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 76 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ Other API changes
^^^^^^^^^^^^^^^^^
- Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
-
Expand Down
60 changes: 30 additions & 30 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def get_dummies(
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
dtype : dtype, default np.uint8
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.
Returns
Expand All @@ -89,50 +89,50 @@ def get_dummies(
>>> s = pd.Series(list('abca'))
>>> pd.get_dummies(s)
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
a b c
0 True False False
1 False True False
2 False False True
3 True False False
>>> s1 = ['a', 'b', np.nan]
>>> pd.get_dummies(s1)
a b
0 1 0
1 0 1
2 0 0
a b
0 True False
1 False True
2 False False
>>> pd.get_dummies(s1, dummy_na=True)
a b NaN
0 1 0 0
1 0 1 0
2 0 0 1
a b NaN
0 True False False
1 False True False
2 False False True
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
... 'C': [1, 2, 3]})
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 1 0 0 1 0
1 2 0 1 1 0 0
2 3 1 0 0 0 1
0 1 True False False True False
1 2 False True True False False
2 3 True False False False True
>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0
a b c
0 True False False
1 False True False
2 False False True
3 True False False
4 True False False
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0
b c
0 False False
1 True False
2 False True
3 False False
4 False False
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
a b c
Expand Down Expand Up @@ -236,7 +236,7 @@ def _get_dummies_1d(
codes, levels = factorize_from_iterable(Series(data))

if dtype is None:
dtype = np.dtype(np.uint8)
dtype = np.dtype(bool)
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"; expected "Type[Any]"
dtype = np.dtype(dtype) # type: ignore[arg-type]
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/frame/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self):
# GH#16115
cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])

expected = DataFrame(
[[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
)
expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats)
dummies = get_dummies(cats)
result = dummies[list(dummies.columns)]
tm.assert_frame_equal(result, expected)
Expand Down
90 changes: 47 additions & 43 deletions pandas/tests/reshape/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_get_dummies_unicode(self, sparse):
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
exp = DataFrame(
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)
Expand All @@ -182,15 +182,15 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
result = get_dummies(df, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=np.uint8,
dtype=bool,
)
if sparse:
expected = DataFrame(
{
"A_a": SparseArray([1, 0, 1], dtype="uint8"),
"A_b": SparseArray([0, 1, 0], dtype="uint8"),
"B_b": SparseArray([1, 1, 0], dtype="uint8"),
"B_c": SparseArray([0, 0, 1], dtype="uint8"),
"A_a": SparseArray([1, 0, 1], dtype="bool"),
"A_b": SparseArray([0, 1, 0], dtype="bool"),
"B_b": SparseArray([1, 1, 0], dtype="bool"),
"B_c": SparseArray([0, 0, 1], dtype="bool"),
}
)

Expand All @@ -208,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df):
"B_b": [1, 1, 0],
"B_c": [0, 0, 1],
},
dtype=np.uint8,
dtype=bool,
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -238,12 +238,11 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
"from_B_b": [1, 1, 0],
"from_B_c": [0, 0, 1],
"from_A_a": [True, False, True],
"from_A_b": [False, True, False],
"from_B_b": [True, True, False],
"from_B_c": [False, False, True],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
Expand All @@ -258,9 +257,12 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
result = get_dummies(df, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["C"] + bad_columns,
dtype=np.uint8,
)
expected = expected.astype({"C": np.int64})
if sparse:
Expand All @@ -269,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
expected = pd.concat(
[
Series([1, 2, 3], name="C"),
Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
],
axis=1,
)
Expand All @@ -290,30 +292,29 @@ def test_dataframe_dummies_subset(self, df, sparse):
},
)
cols = expected.columns
expected[cols[1:]] = expected[cols[1:]].astype(np.uint8)
expected[cols[1:]] = expected[cols[1:]].astype(bool)
expected[["C"]] = df[["C"]]
if sparse:
cols = ["from_A_a", "from_A_b"]
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"A..a": [1, 0, 1],
"A..b": [0, 1, 0],
"B..b": [1, 1, 0],
"B..c": [0, 0, 1],
"A..a": [True, False, True],
"A..b": [False, True, False],
"B..b": [True, True, False],
"B..c": [False, False, True],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
if sparse:
cols = ["A..a", "A..b", "B..b", "B..c"]
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -356,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
)

columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected[columns] = expected[columns].astype(np.uint8)
expected[columns] = expected[columns].astype(bool)
if sparse:
expected[columns] = expected[columns].astype(SparseDtype("uint8", 0))
expected[columns] = expected[columns].astype(SparseDtype("bool", 0))

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -422,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
[
(
{"data": DataFrame({"ä": ["a"]})},
DataFrame({"ä_a": [1]}, dtype=np.uint8),
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["ä"]})},
DataFrame({"x_ä": [1]}, dtype=np.uint8),
DataFrame({"x_ä": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
DataFrame({"ä_a": [1]}, dtype=np.uint8),
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
DataFrame({"xäa": [1]}, dtype=np.uint8),
DataFrame({"xäa": [True]}),
),
],
)
Expand All @@ -451,7 +452,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))

expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)

result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
Expand Down Expand Up @@ -487,14 +488,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)

tm.assert_frame_equal(res, exp)

res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
["b", np.nan], axis=1
)
if sparse:
Expand All @@ -510,7 +511,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)
Expand All @@ -522,7 +523,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
cols = ["A_b", "B_c", "cat_y"]
expected[cols] = expected[cols].astype(np.uint8)
expected[cols] = expected[cols].astype(bool)
expected = expected[["C", "A_b", "B_c", "cat_y"]]
if sparse:
for col in cols:
Expand All @@ -544,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
}
)
cols = ["A_b", "A_nan", "B_c", "B_nan"]
expected[cols] = expected[cols].astype(np.uint8)
expected[cols] = expected[cols].astype(bool)
expected = expected.sort_index(axis=1)
if sparse:
for col in cols:
Expand All @@ -559,13 +560,13 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
def test_get_dummies_int_int(self):
data = Series([1, 2, 1])
result = get_dummies(data)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
tm.assert_frame_equal(result, expected)

data = Series(Categorical(["a", "b", "a"]))
result = get_dummies(data)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -616,9 +617,12 @@ def test_get_dummies_duplicate_columns(self, df):
result = get_dummies(df).sort_index(axis=1)

expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["A", "A_a", "A_b", "A_b", "A_c"],
dtype=np.uint8,
).sort_index(axis=1)

expected = expected.astype({"A": np.int64})
Expand All @@ -628,7 +632,7 @@ def test_get_dummies_duplicate_columns(self, df):
def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
dtype = SparseDtype("uint8", 0)
dtype = SparseDtype("bool", 0)
expected = DataFrame(
{
"A_1": SparseArray([1, 0], dtype=dtype),
Expand Down

0 comments on commit bfdf223

Please sign in to comment.