
[SPARK-46858][PYTHON][PS][BUILD] Upgrade Pandas to 2.2.0 #44881

Closed · wants to merge 31 commits

Commits (31) — changes shown below are from 24 of the 31 commits
9ae857a
[SPARK-46858][PYTHON][PS][INFRA] Upgrade Pandas to 2.2.0
itholic Jan 25, 2024
5caa678
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Jan 29, 2024
e9a6445
pin version
itholic Jan 29, 2024
edb3d9a
fix series default name issue
itholic Jan 29, 2024
5440381
upperbound for PyPy3
itholic Jan 30, 2024
3e66505
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 13, 2024
8643ebd
Fix melt
itholic Feb 13, 2024
a8237b4
Fix test util related changes
itholic Feb 13, 2024
836dcfe
Fix more test utils
itholic Feb 13, 2024
d3c5f57
Fix resample test
itholic Feb 13, 2024
66f69a2
Rule code mapping
itholic Feb 14, 2024
9d4e8a1
Fix booleanops tests
itholic Feb 14, 2024
37300e8
use proper rule code
itholic Feb 14, 2024
ea57fdb
Fix unsupported cases
itholic Feb 14, 2024
a3f3e91
Fix Categorical test
itholic Feb 15, 2024
8a24900
Fix SparkConnectFunctionsTests
itholic Feb 15, 2024
b727550
Fix linter
itholic Feb 15, 2024
e92082f
Fix NumOpsTests
itholic Feb 16, 2024
5f62fcc
Fix FrameReshapingTests
itholic Feb 16, 2024
f235780
ResampleSeriesTests
itholic Feb 16, 2024
ad67735
Fix ReverseTests
itholic Feb 16, 2024
4e6c77a
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 16, 2024
26b7bd6
revert unrelated changes
itholic Feb 16, 2024
4c84b2a
Fix plotting
itholic Feb 16, 2024
fbbaf88
Fix BoxPlot
itholic Feb 19, 2024
0ca4aa6
Fix concat bug in Pandas
itholic Feb 19, 2024
7536263
Fix DataFrame hist plot
itholic Feb 19, 2024
b07e608
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 20, 2024
acd7b7f
Add release note
itholic Feb 20, 2024
d560825
has -> have
itholic Feb 20, 2024
6de7931
Make resample work in old pandas as well
itholic Feb 20, 2024
4 changes: 2 additions & 2 deletions dev/infra/Dockerfile
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.1.4' scipy coverage matplotlib lxml
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.2.0' scipy coverage matplotlib lxml


ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.1.4 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.2.0 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.59.3 grpcio-status==1.59.3 protobuf==4.25.1 googleapis-common-protos==1.56.4"

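As an aside on the `pandas<=2.2.0` pins above, here is a minimal sketch of how such a specifier is evaluated, using the `packaging` library for illustration (an assumption; pip applies the same PEP 440 semantics):

```python
# Illustrative only: check candidate pandas versions against the new pin.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("<=2.2.0")
print(Version("2.1.4") in spec)  # True  (old pin still satisfied)
print(Version("2.2.0") in spec)  # True  (newly allowed by this PR)
print(Version("2.2.1") in spec)  # False (still excluded)
```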
6 changes: 4 additions & 2 deletions python/pyspark/pandas/frame.py
@@ -10607,8 +10607,10 @@ def melt(
name_like_string(name) if name is not None else "variable_{}".format(i)
for i, name in enumerate(self._internal.column_label_names)
]
elif isinstance(var_name, str):
var_name = [var_name]
elif is_list_like(var_name):
raise ValueError(f"{var_name=} must be a scalar.")
else:
var_name = [var_name] # type: ignore[list-item]

pairs = F.explode(
F.array(
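To make the behavior change concrete, a hedged usage sketch (the frame and column names are made up): a scalar `var_name` is still accepted and normalized to a list, while a list-like `var_name` now raises, matching the updated melt test later in this diff.

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"A": [1, 2], "B": [3, 4]})
psdf.melt(id_vars="A", var_name="my_var")        # OK: scalar is wrapped as ["my_var"]
# psdf.melt(id_vars="A", var_name=["v1", "v2"])  # raises ValueError: must be a scalar
```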
3 changes: 1 addition & 2 deletions python/pyspark/pandas/namespace.py
@@ -85,7 +85,6 @@
from pyspark.pandas.frame import DataFrame, _reduce_spark_multi
from pyspark.pandas.internal import (
InternalFrame,
DEFAULT_SERIES_NAME,
HIDDEN_COLUMNS,
SPARK_INDEX_NAME_FORMAT,
)
@@ -2554,7 +2553,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
if isinstance(obj, Series):
num_series += 1
series_names.add(obj.name)
new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
new_objs.append(obj.to_frame())
else:
assert isinstance(obj, DataFrame)
new_objs.append(obj)
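Why dropping `DEFAULT_SERIES_NAME` works — a hedged plain-pandas illustration (not the pyspark code path itself): modern pandas assigns positional integer column labels when concatenating unnamed Series, so `to_frame()` no longer needs an explicit placeholder name.

```python
import pandas as pd

s1, s2 = pd.Series([1, 2]), pd.Series([3, 4])
result = pd.concat([s1, s2], axis=1)
print(result.columns.tolist())  # [0, 1] — positional labels, no placeholder needed
```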
50 changes: 35 additions & 15 deletions python/pyspark/pandas/plot/matplotlib.py
@@ -20,6 +20,7 @@
import matplotlib as mat
import numpy as np
from matplotlib.axes._base import _process_plot_format # type: ignore[attr-defined]
from matplotlib.figure import Figure
from pandas.core.dtypes.inference import is_list_like
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib import ( # type: ignore[attr-defined]
@@ -277,7 +278,7 @@ def _compute_plot_data(self):

self.data = {labels[0]: stats}

def _make_plot(self):
def _make_plot(self, fig: Figure):
bxpstats = list(self.data.values())[0]
ax = self._get_ax(0)
kwds = self.kwds.copy()
@@ -363,10 +364,23 @@ def _args_adjust(self):
if is_list_like(self.bottom):
self.bottom = np.array(self.bottom)

def _ensure_frame(self, data):
return data

def _calculate_bins(self, data, bins):
return bins
Comment on lines +413 to +414 — itholic (Contributor Author):

Pandas recently pushed a couple of commits refactoring its internal plotting structure, such as pandas-dev/pandas#55850 and pandas-dev/pandas#55872, so we also need to override a couple of internal methods to follow the latest Pandas behavior.


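A minimal, self-contained sketch of the pattern this refactor imposes (class names here are stand-ins, not pandas internals): pandas 2.2's plotting backend now threads the `Figure` into `_make_plot`, so every override below gains a `fig` parameter and forwards it when delegating.

```python
import matplotlib.pyplot as plt
from matplotlib.figure import Figure

class BasePlot:  # stand-in for the pandas MPLPlot base class
    def _make_plot(self, fig: Figure):
        fig.suptitle("drawn by BasePlot")

class SparkPlot(BasePlot):  # stand-in for the PandasOnSpark* subclasses
    def _make_plot(self, fig: Figure):
        # subclass hook runs first, then delegates upward with the same Figure
        super()._make_plot(fig)

SparkPlot()._make_plot(plt.figure())
```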
def _compute_plot_data(self):
self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins)

def _make_plot(self):
def _make_plot_keywords(self, kwds, y):
"""merge BoxPlot/KdePlot properties to passed kwds"""
# y is required for KdePlot
kwds["bottom"] = self.bottom
kwds["bins"] = self.bins
return kwds

def _make_plot(self, fig: Figure):
# TODO: this logic is similar to KdePlot. Might have to deduplicate it.
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
@@ -423,9 +437,9 @@ class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
@@ -434,9 +448,9 @@ class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
@@ -445,9 +459,9 @@ class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
@@ -456,9 +470,9 @@ class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
@@ -467,9 +481,9 @@ class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
def __init__(self, data, x, y, **kwargs):
super().__init__(self.get_top_n(data), x, y, **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
@@ -478,7 +492,12 @@ class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
def _compute_plot_data(self):
self.data = KdePlotBase.prepare_kde_data(self.data)

def _make_plot(self):
def _make_plot_keywords(self, kwds, y):
kwds["bw_method"] = self.bw_method
kwds["ind"] = type(self)._get_ind(y, ind=self.ind)
return kwds

def _make_plot(self, fig: Figure):
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
colors = self._get_colors(num_colors=1)
@@ -515,8 +534,9 @@ def _make_plot(self):
self, "_append_legend_handles_labels"
) else self._add_legend_handle(artists[0], label, index=i)

def _get_ind(self, y):
return KdePlotBase.get_ind(y, self.ind)
@staticmethod
def _get_ind(y, ind):
return KdePlotBase.get_ind(y, ind)

@classmethod
def _plot(
13 changes: 7 additions & 6 deletions python/pyspark/pandas/resample.py
@@ -91,7 +91,8 @@ def __init__(
self._resamplekey = resamplekey

self._offset = to_offset(rule)
if self._offset.rule_code not in ["A-DEC", "M", "D", "H", "T", "S"]:

if self._offset.rule_code not in ["A-DEC", "ME", "D", "h", "min", "s"]:
raise ValueError("rule code {} is not supported".format(self._offset.rule_code))
if not getattr(self._offset, "n") > 0:
raise ValueError("rule offset must be positive")
@@ -184,7 +185,7 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
)
)

elif rule_code == "M":
elif rule_code == "ME":
assert (
origin.is_month_end
and origin.hour == 0
@@ -264,8 +265,8 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:

ret = F.when(edge_cond, edge_label).otherwise(non_edge_label)

elif rule_code in ["H", "T", "S"]:
unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
elif rule_code in ["h", "min", "s"]:
unit_mapping = {"h": "HOUR", "min": "MINUTE", "s": "SECOND"}
unit_str = unit_mapping[rule_code]

truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
@@ -274,10 +275,10 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
diff = timestampdiff(unit_str, origin_scol, truncated_ts_scol)
mod = F.lit(0) if n == 1 else (diff % F.lit(n))

if rule_code == "H":
if rule_code == "h":
assert origin.minute == 0 and origin.second == 0
edge_cond = (mod == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0)
elif rule_code == "T":
elif rule_code == "min":
assert origin.second == 0
edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
else:
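The alias renames driving these changes ('M' → 'ME', 'H' → 'h', 'T' → 'min', 'S' → 's' in pandas 2.2) suggest a small compatibility shim; a hedged sketch (names are hypothetical, inspired by the "Make resample work in old pandas as well" commit rather than copied from it):

```python
# Hypothetical helper: normalize pre-2.2 offset aliases to their 2.2 spellings.
OLD_TO_NEW_RULE_CODE = {"M": "ME", "H": "h", "T": "min", "S": "s"}

def canonical_rule_code(code: str) -> str:
    return OLD_TO_NEW_RULE_CODE.get(code, code)

assert canonical_rule_code("T") == "min"
assert canonical_rule_code("ME") == "ME"  # already-canonical codes pass through
```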
8 changes: 4 additions & 4 deletions python/pyspark/pandas/series.py
@@ -7092,15 +7092,15 @@ def resample(
----------
rule : str
The offset string or object representing target conversion.
Currently, supported units are {'Y', 'A', 'M', 'D', 'H',
'T', 'MIN', 'S'}.
Currently, supported units are {'YE', 'A', 'ME', 'D', 'h',
'min', 'MIN', 's'}.
closed : {{'right', 'left'}}, default None
Which side of bin interval is closed. The default is 'left'
for all frequency offsets except for 'A', 'Y' and 'M' which all
for all frequency offsets except for 'A', 'YE' and 'ME' which all
have a default of 'right'.
label : {{'right', 'left'}}, default None
Which bin edge label to label bucket with. The default is 'left'
for all frequency offsets except for 'A', 'Y' and 'M' which all
for all frequency offsets except for 'A', 'YE' and 'ME' which all
have a default of 'right'.
on : Series, optional
For a DataFrame, column to use instead of index for resampling.
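A hedged usage sketch with the 2.2-style aliases the docstring now lists (the data is made up):

```python
import pandas as pd
import pyspark.pandas as ps

pser = pd.Series(range(6), index=pd.date_range("2024-01-01", periods=6, freq="min"))
psser = ps.from_pandas(pser)
psser.resample("3min").sum()  # 'min' replaces the pre-2.2 'T' alias
```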
2 changes: 1 addition & 1 deletion python/pyspark/pandas/supported_api_gen.py
@@ -37,7 +37,7 @@
MAX_MISSING_PARAMS_SIZE = 5
COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
PANDAS_LATEST_VERSION = "2.1.4"
PANDAS_LATEST_VERSION = "2.2.0"

RST_HEADER = """
=====================
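For context, a hedged sketch of how a constant like `PANDAS_LATEST_VERSION` is typically consumed (this is an assumption about usage, not code from this file): warn when the installed pandas diverges from the version the supported-API docs were generated against.

```python
import warnings
import pandas as pd
from packaging.version import Version  # assumption: the real module may compare versions differently

PANDAS_LATEST_VERSION = "2.2.0"

if Version(pd.__version__) != Version(PANDAS_LATEST_VERSION):
    warnings.warn(
        f"Docs were generated against pandas {PANDAS_LATEST_VERSION}, found {pd.__version__}."
    )
```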
13 changes: 3 additions & 10 deletions python/pyspark/pandas/tests/computation/test_melt.py
@@ -100,23 +100,16 @@ def test_melt(self):
.sort_values(["variable_0", "variable_1", "value"])
.rename(columns=name_like_string),
)
self.assert_eq(
psdf.melt(
self.assertRaises(
ValueError,
lambda: psdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.reset_index(drop=True),
pdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.rename(columns=name_like_string),
)

columns.names = ["v0", "v1"]
8 changes: 4 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -54,7 +54,7 @@ def test_add(self):

for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(b_pser + pser, b_psser + psser)
self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)
for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
if col == "bool":
@@ -73,7 +73,7 @@ def test_sub(self):
self.assertRaises(TypeError, lambda: b_psser - True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser - pdf[col], b_psser - psdf[col])
self.assert_eq(b_pser - pdf[col], b_psser - psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -90,7 +90,7 @@ def test_mul(self):
self.assert_eq(b_pser * False, b_psser * False)

for col in self.numeric_df_cols:
self.assert_eq(b_pser * pdf[col], b_psser * psdf[col])
self.assert_eq(b_pser * pdf[col], b_psser * psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
@@ -145,7 +145,7 @@ def test_mod(self):
self.assertRaises(TypeError, lambda: b_psser % True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser % pdf[col], b_psser % psdf[col])
self.assert_eq(b_pser % pdf[col], b_psser % psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser % psdf[col])
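What `check_exact=False` buys in these tests — a hedged standalone illustration using pandas' own testing helper (not the pyspark test harness): float comparisons fall back to rtol/atol tolerance, absorbing the small dtype/precision differences pandas 2.2 introduces.

```python
import pandas as pd
from pandas.testing import assert_series_equal

left, right = pd.Series([0.1 + 0.2]), pd.Series([0.3])
assert_series_equal(left, right, check_exact=False)   # passes within tolerance
# assert_series_equal(left, right, check_exact=True)  # would raise AssertionError
```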
12 changes: 8 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
@@ -105,14 +105,18 @@ def complex_psdf(self):
def test_add(self):
pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col])
self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col], check_exact=False)

# Numeric array + Numeric array
for col in self.numeric_array_df_cols:
pser1, psser1 = pdf[col], psdf[col]
for other_col in self.numeric_array_df_cols:
pser2, psser2 = pdf[other_col], psdf[other_col]
self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values())
self.assert_eq(
(pser1 + pser2).sort_values(),
(psser1 + psser2).sort_values(),
check_exact=False,
)

# Non-numeric array + Non-numeric array
self.assertRaises(
@@ -130,7 +134,7 @@ def test_add(self):

for col in self.non_numeric_array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser)
self.assert_eq(pser + pser, psser + psser, check_exact=False)

# Numeric array + Non-numeric array
for numeric_col in self.numeric_array_df_cols:
@@ -240,7 +244,7 @@ def test_from_to_pandas(self):
pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser, psser._to_pandas())
self.assert_eq(pser, psser._to_pandas(), check_exact=False)
self.assert_eq(ps.from_pandas(pser), psser)

def test_isnull(self):
24 changes: 12 additions & 12 deletions python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,33 +44,33 @@ def test_add(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser)
self.assert_eq(pser + 1, psser + 1)
self.assert_eq(pser + pser, psser + psser, check_exact=False)
self.assert_eq(pser + 1, psser + 1, check_exact=False)
# self.assert_eq(pser + 0.1, psser + 0.1)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool))
self.assert_eq(pser + True, psser + True)
self.assert_eq(pser + False, psser + False)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool), check_exact=False)
self.assert_eq(pser + True, psser + True, check_exact=False)
self.assert_eq(pser + False, psser + False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col])
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser + psdf[n_col])

def test_sub(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser - pser, psser - psser)
self.assert_eq(pser - 1, psser - 1)
self.assert_eq(pser - pser, psser - psser, check_exact=False)
self.assert_eq(pser - 1, psser - 1, check_exact=False)
# self.assert_eq(pser - 0.1, psser - 0.1)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool))
self.assert_eq(pser - True, psser - True)
self.assert_eq(pser - False, psser - False)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool), check_exact=False)
self.assert_eq(pser - True, psser - True, check_exact=False)
self.assert_eq(pser - False, psser - False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col])
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])
