Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-46858][PYTHON][PS][BUILD] Upgrade Pandas to 2.2.0 #44881

Closed
itholic wants to merge 31 commits into apache/spark (master) from branch pandas_…
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
9ae857a
[SPARK-46858][PYTHON][PS][INFRA] Upgrade Pandas to 2.2.0
itholic Jan 25, 2024
5caa678
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Jan 29, 2024
e9a6445
pin version
itholic Jan 29, 2024
edb3d9a
fix series default name issue
itholic Jan 29, 2024
5440381
upperbound for PyPy3
itholic Jan 30, 2024
3e66505
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 13, 2024
8643ebd
Fix melt
itholic Feb 13, 2024
a8237b4
Fix test util related changes
itholic Feb 13, 2024
836dcfe
Fix more test utils
itholic Feb 13, 2024
d3c5f57
Fix resample test
itholic Feb 13, 2024
66f69a2
Rule code mapping
itholic Feb 14, 2024
9d4e8a1
Fix booleanops tests
itholic Feb 14, 2024
37300e8
use proper rule code
itholic Feb 14, 2024
ea57fdb
Fix unsupported cases
itholic Feb 14, 2024
a3f3e91
Fix Categorical test
itholic Feb 15, 2024
8a24900
Fix SparkConnectFunctionsTests
itholic Feb 15, 2024
b727550
Fix linter
itholic Feb 15, 2024
e92082f
Fix NumOpsTests
itholic Feb 16, 2024
5f62fcc
Fix FrameReshapingTests
itholic Feb 16, 2024
f235780
ResampleSeriesTests
itholic Feb 16, 2024
ad67735
Fix ReverseTests
itholic Feb 16, 2024
4e6c77a
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 16, 2024
26b7bd6
revert unrelated changes
itholic Feb 16, 2024
4c84b2a
Fix plotting
itholic Feb 16, 2024
fbbaf88
Fix BoxPlot
itholic Feb 19, 2024
0ca4aa6
Fix concat bug in Pandas
itholic Feb 19, 2024
7536263
Fix DataFrame hist plot
itholic Feb 19, 2024
b07e608
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 20, 2024
acd7b7f
Add release note
itholic Feb 20, 2024
d560825
has -> have
itholic Feb 20, 2024
6de7931
Make resample work in old pandas as well
itholic Feb 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
- RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.1.4' scipy coverage matplotlib lxml
+ RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.2.0' scipy coverage matplotlib lxml
itholic marked this conversation as resolved.
Show resolved Hide resolved


- ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.1.4 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+ ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.2.0 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.59.3 grpcio-status==1.59.3 protobuf==4.25.1 googleapis-common-protos==1.56.4"

Expand Down
4 changes: 3 additions & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10607,7 +10607,9 @@ def melt(
name_like_string(name) if name is not None else "variable_{}".format(i)
for i, name in enumerate(self._internal.column_label_names)
]
elif isinstance(var_name, str):
elif is_list_like(var_name):
raise ValueError(f"{var_name=} must be a scalar.")
else:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

var_name = [var_name]

pairs = F.explode(
Expand Down
3 changes: 1 addition & 2 deletions python/pyspark/pandas/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@
from pyspark.pandas.frame import DataFrame, _reduce_spark_multi
from pyspark.pandas.internal import (
InternalFrame,
DEFAULT_SERIES_NAME,
HIDDEN_COLUMNS,
SPARK_INDEX_NAME_FORMAT,
)
Expand Down Expand Up @@ -2554,7 +2553,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
if isinstance(obj, Series):
num_series += 1
series_names.add(obj.name)
- new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
+ new_objs.append(obj.to_frame())
itholic marked this conversation as resolved.
Show resolved Hide resolved
else:
assert isinstance(obj, DataFrame)
new_objs.append(obj)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/pandas/supported_api_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
MAX_MISSING_PARAMS_SIZE = 5
COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
- PANDAS_LATEST_VERSION = "2.1.4"
+ PANDAS_LATEST_VERSION = "2.2.0"

RST_HEADER = """
=====================
Expand Down
13 changes: 3 additions & 10 deletions python/pyspark/pandas/tests/computation/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,16 @@ def test_melt(self):
.sort_values(["variable_0", "variable_1", "value"])
.rename(columns=name_like_string),
)
self.assert_eq(
psdf.melt(
self.assertRaises(
ValueError,
lambda: psdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.reset_index(drop=True),
pdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.rename(columns=name_like_string),
)

columns.names = ["v0", "v1"]
Expand Down
24 changes: 12 additions & 12 deletions python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,33 +44,33 @@ def test_add(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser)
self.assert_eq(pser + 1, psser + 1)
self.assert_eq(pser + pser, psser + psser, check_exact=False)
self.assert_eq(pser + 1, psser + 1, check_exact=False)
# self.assert_eq(pser + 0.1, psser + 0.1)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool))
self.assert_eq(pser + True, psser + True)
self.assert_eq(pser + False, psser + False)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool), check_exact=False)
self.assert_eq(pser + True, psser + True, check_exact=False)
self.assert_eq(pser + False, psser + False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col])
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser + psdf[n_col])

def test_sub(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser - pser, psser - psser)
self.assert_eq(pser - 1, psser - 1)
self.assert_eq(pser - pser, psser - psser, check_exact=False)
self.assert_eq(pser - 1, psser - 1, check_exact=False)
# self.assert_eq(pser - 0.1, psser - 0.1)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool))
self.assert_eq(pser - True, psser - True)
self.assert_eq(pser - False, psser - False)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool), check_exact=False)
self.assert_eq(pser - True, psser - True, check_exact=False)
self.assert_eq(pser - False, psser - False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col])
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])

Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/pandas/tests/indexes/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ def test_to_series(self):
psidx = self.psdf.set_index("b", append=True).index

with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
self.assert_eq(psidx.to_series(), pidx.to_series())
self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"))
self.assert_eq(psidx.to_series(), pidx.to_series(), check_exact=False)
self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"), check_exact=False)

expected_error_message = "Series.name must be a hashable type"
with self.assertRaisesRegex(TypeError, expected_error_message):
Expand Down
9 changes: 9 additions & 0 deletions python/pyspark/pandas/tests/resample/test_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,21 @@ def test_resample_error(self):
psdf.A.resample("3W").sum()

with self.assertRaisesRegex(ValueError, "rule offset must be positive"):
psdf.A.resample("0D").sum()

with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
psdf.A.resample("0Y").sum()

with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
psdf.A.resample("3D", closed="middle").sum()

with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
psdf.A.resample("3Y", closed="middle").sum()

with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
psdf.A.resample("3D", label="both").sum()

with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
psdf.A.resample("3Y", label="both").sum()

with self.assertRaisesRegex(
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/sql/tests/connect/test_connect_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2244,6 +2244,7 @@ def test_stat_freq_items(self):
self.assert_eq(
self.connect.read.table(self.tbl_name2).stat.freqItems(["col1", "col3"]).toPandas(),
self.spark.read.table(self.tbl_name2).stat.freqItems(["col1", "col3"]).toPandas(),
check_exact=False,
)

self.assert_eq(
Expand Down