Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: share Index/Block get_values_for_csv #55485

Merged
merged 3 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 127 additions & 27 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
algos as libalgos,
index as libindex,
lib,
writers,
)
from pandas._libs.internals import BlockValuesRefs
import pandas._libs.join as libjoin
Expand Down Expand Up @@ -97,7 +98,6 @@
is_bool_dtype,
is_ea_or_datetimelike_dtype,
is_float,
is_float_dtype,
is_hashable,
is_integer,
is_iterator,
Expand All @@ -119,6 +119,7 @@
ExtensionDtype,
IntervalDtype,
PeriodDtype,
SparseDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
Expand Down Expand Up @@ -151,7 +152,9 @@
ArrowExtensionArray,
BaseMaskedArray,
Categorical,
DatetimeArray,
ExtensionArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringArray
from pandas.core.base import (
Expand Down Expand Up @@ -199,7 +202,10 @@
MultiIndex,
Series,
)
from pandas.core.arrays import PeriodArray
from pandas.core.arrays import (
IntervalArray,
PeriodArray,
)

__all__ = ["Index"]

Expand Down Expand Up @@ -1403,7 +1409,7 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str
result = trim_front(formatted)
return header + result

def _format_native_types(
def _get_values_for_csv(
self,
*,
na_rep: str_t = "",
Expand All @@ -1412,30 +1418,14 @@ def _format_native_types(
date_format=None,
quoting=None,
) -> npt.NDArray[np.object_]:
"""
Actually format specific types of the index.
"""
from pandas.io.formats.format import FloatArrayFormatter

if is_float_dtype(self.dtype) and not isinstance(self.dtype, ExtensionDtype):
formatter = FloatArrayFormatter(
self._values,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
quoting=quoting,
fixed_width=False,
)
return formatter.get_result_as_array()

mask = isna(self)
if self.dtype != object and not quoting:
values = np.asarray(self).astype(str)
else:
values = np.array(self, dtype=object, copy=True)

values[mask] = na_rep
return values
return get_values_for_csv(
self._values,
na_rep=na_rep,
decimal=decimal,
float_format=float_format,
date_format=date_format,
quoting=quoting,
)

def _summary(self, name=None) -> str_t:
"""
Expand Down Expand Up @@ -7629,3 +7619,113 @@ def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None):
stacklevel=find_stack_level(),
)
return result


def _widen_for_na_rep(strs: np.ndarray, na_rep: str) -> np.ndarray:
    """
    Make a fixed-width unicode array wide enough to store ``na_rep``.

    Assigning a string into a NumPy ``<U{n}`` array silently truncates it to
    ``n`` characters, so the array must be widened *before* missing entries
    are overwritten with ``na_rep``.

    Parameters
    ----------
    strs : np.ndarray
        Result of an ``astype(str)`` conversion. Anything that is not a
        NumPy unicode array is returned unchanged.
    na_rep : str
        Missing-value marker that will later be assigned into ``strs``.

    Returns
    -------
    np.ndarray
        ``strs`` itself when it is already wide enough, otherwise a copy
        cast to ``<U{len(na_rep)}``.
    """
    width = writers.word_len(na_rep)
    if (
        isinstance(strs, np.ndarray)
        and strs.dtype.kind == "U"
        and strs.dtype.itemsize // np.dtype("U1").itemsize < width
    ):
        # enlarge for the na_rep
        return strs.astype(f"<U{width}")
    return strs


def get_values_for_csv(
    values: ArrayLike,
    *,
    date_format,
    na_rep: str = "nan",
    quoting=None,
    float_format=None,
    decimal: str = ".",
) -> npt.NDArray[np.object_]:
    """
    Convert to types which can be consumed by the standard library's
    csv.writer.writerows.

    Parameters
    ----------
    values : ArrayLike
        Array to convert.
    date_format
        Forwarded to ``_format_native_types`` for datetime, timedelta and
        period data; unused by the other branches.
    na_rep : str, default "nan"
        Marker written in place of missing entries.
    quoting : optional
        When falsy, values are converted to strings; when truthy, the
        original objects are kept in an object array (only the missing
        entries are replaced by ``na_rep``).
    float_format : optional
        Forwarded to ``FloatArrayFormatter`` for float data.
    decimal : str, default "."
        Forwarded to ``FloatArrayFormatter`` for float data.

    Returns
    -------
    np.ndarray
        Converted values with missing entries replaced by ``na_rep``.

    Notes
    -----
    Whenever values are stringified into a fixed-width unicode array, the
    array is widened first so that a long ``na_rep`` is never truncated.
    """
    if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm":
        # GH#40754 Convert categorical datetimes to datetime array
        values = algos.take_nd(
            values.categories._values,
            ensure_platform_int(values._codes),
            fill_value=na_rep,
        )

    values = ensure_wrapped_if_datetimelike(values)

    if isinstance(values, (DatetimeArray, TimedeltaArray)):
        if values.ndim == 1:
            result = values._format_native_types(na_rep=na_rep, date_format=date_format)
            result = result.astype(object, copy=False)
            return result

        # GH#21734 Process every column separately, they might have different formats
        results_converted = []
        for i in range(len(values)):
            result = values[i, :]._format_native_types(
                na_rep=na_rep, date_format=date_format
            )
            results_converted.append(result.astype(object, copy=False))
        return np.vstack(results_converted)

    elif isinstance(values.dtype, PeriodDtype):
        # TODO: tests that get here in column path
        values = cast("PeriodArray", values)
        res = values._format_native_types(na_rep=na_rep, date_format=date_format)
        return res

    elif isinstance(values.dtype, IntervalDtype):
        # TODO: tests that get here in column path
        values = cast("IntervalArray", values)
        mask = values.isna()
        if not quoting:
            # widen before assigning so a long na_rep is not truncated
            result = _widen_for_na_rep(np.asarray(values).astype(str), na_rep)
        else:
            result = np.array(values, dtype=object, copy=True)

        result[mask] = na_rep
        return result

    elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype):
        # see GH#13418: no special formatting is desired at the
        # output (important for appropriate 'quoting' behaviour),
        # so do not pass it through the FloatArrayFormatter
        if float_format is None and decimal == ".":
            mask = isna(values)

            if not quoting:
                # widen before assigning so a long na_rep is not truncated
                values = _widen_for_na_rep(values.astype(str), na_rep)
            else:
                values = np.array(values, dtype="object")

            values[mask] = na_rep
            values = values.astype(object, copy=False)
            return values

        # local import, as in the original code
        from pandas.io.formats.format import FloatArrayFormatter

        formatter = FloatArrayFormatter(
            values,
            na_rep=na_rep,
            float_format=float_format,
            decimal=decimal,
            quoting=quoting,
            fixed_width=False,
        )
        res = formatter.get_result_as_array()
        res = res.astype(object, copy=False)
        return res

    elif isinstance(values, ExtensionArray):
        mask = isna(values)

        new_values = np.asarray(values.astype(object))
        new_values[mask] = na_rep
        return new_values

    else:
        mask = isna(values)
        # word_len is 0 for an empty (or non-string) na_rep, in which case
        # the object-array path below is used instead of stringifying
        itemsize = writers.word_len(na_rep)

        if values.dtype != _dtype_obj and not quoting and itemsize:
            values = _widen_for_na_rep(values.astype(str), na_rep)
        else:
            values = np.array(values, dtype="object")

        values[mask] = na_rep
        values = values.astype(object, copy=False)
        return values
2 changes: 1 addition & 1 deletion pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def _format_with_header(
# TODO: not reached in tests 2023-10-11
# matches base class except for whitespace padding and date_format
return header + list(
self._format_native_types(na_rep=na_rep, date_format=date_format)
self._get_values_for_csv(na_rep=na_rep, date_format=date_format)
)

@property
Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def _new_DatetimeIndex(cls, d):
"tzinfo",
"dtype",
"to_pydatetime",
"_format_native_types",
"date",
"time",
"timetz",
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1384,15 +1384,15 @@ def _formatter_func(self, tup):
formatter_funcs = [level._formatter_func for level in self.levels]
return tuple(func(val) for func, val in zip(formatter_funcs, tup))

def _format_native_types(
def _get_values_for_csv(
self, *, na_rep: str = "nan", **kwargs
) -> npt.NDArray[np.object_]:
new_levels = []
new_codes = []

# go through the levels and format them
for level, level_codes in zip(self.levels, self.codes):
level_strs = level._format_native_types(na_rep=na_rep, **kwargs)
level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs)
# add nan values, if there are any
mask = level_codes == -1
if mask.any():
Expand All @@ -1408,7 +1408,7 @@ def _format_native_types(

if len(new_levels) == 1:
# a single-level multi-index
return Index(new_levels[0].take(new_codes[0]))._format_native_types()
return Index(new_levels[0].take(new_codes[0]))._get_values_for_csv()
else:
# reconstruct the multi-index
mi = MultiIndex(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def _new_PeriodIndex(cls, **d):
PeriodArray,
wrap=True,
)
@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray)
@inherit_names(["is_leap_year"], PeriodArray)
class PeriodIndex(DatetimeIndexOpsMixin):
"""
Immutable ndarray holding ordinal values indicating regular periods in time.
Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
"sum",
"std",
"median",
"_format_native_types",
],
TimedeltaArray,
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
Index,
ensure_index,
)
from pandas.core.indexes.base import get_values_for_csv
from pandas.core.internals.base import (
DataManager,
SingleDataManager,
Expand All @@ -79,7 +80,6 @@
ensure_block_shape,
external_values,
extract_pandas_array,
get_values_for_csv,
maybe_coerce_values,
new_block,
)
Expand Down
Loading
Loading