
[SPARK-46858][PYTHON][PS][BUILD] Upgrade Pandas to 2.2.0 #44881

Closed · wants to merge 31 commits

Commits (31) — changes shown below are from 24 of the 31 commits
9ae857a
[SPARK-46858][PYTHON][PS][INFRA] Upgrade Pandas to 2.2.0
itholic Jan 25, 2024
5caa678
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Jan 29, 2024
e9a6445
pin version
itholic Jan 29, 2024
edb3d9a
fix series default name issue
itholic Jan 29, 2024
5440381
upperbound for PyPy3
itholic Jan 30, 2024
3e66505
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 13, 2024
8643ebd
Fix melt
itholic Feb 13, 2024
a8237b4
Fix test util related changes
itholic Feb 13, 2024
836dcfe
Fix more test utils
itholic Feb 13, 2024
d3c5f57
Fix resample test
itholic Feb 13, 2024
66f69a2
Rule code mapping
itholic Feb 14, 2024
9d4e8a1
Fix booleanops tests
itholic Feb 14, 2024
37300e8
use proper rule code
itholic Feb 14, 2024
ea57fdb
Fix unsupported cases
itholic Feb 14, 2024
a3f3e91
Fix Categorical test
itholic Feb 15, 2024
8a24900
Fix SparkConnectFunctionsTests
itholic Feb 15, 2024
b727550
Fix linter
itholic Feb 15, 2024
e92082f
Fix NumOpsTests
itholic Feb 16, 2024
5f62fcc
Fix FrameReshapingTests
itholic Feb 16, 2024
f235780
ResampleSeriesTests
itholic Feb 16, 2024
ad67735
Fix ReverseTests
itholic Feb 16, 2024
4e6c77a
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 16, 2024
26b7bd6
revert unrelated changes
itholic Feb 16, 2024
4c84b2a
Fix plotting
itholic Feb 16, 2024
fbbaf88
Fix BoxPlot
itholic Feb 19, 2024
0ca4aa6
Fix concat bug in Pandas
itholic Feb 19, 2024
7536263
Fix DataFrame hist plot
itholic Feb 19, 2024
b07e608
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Feb 20, 2024
acd7b7f
Add release note
itholic Feb 20, 2024
d560825
has -> have
itholic Feb 20, 2024
6de7931
Make resample work in old pandas as well
itholic Feb 20, 2024
4 changes: 2 additions & 2 deletions dev/infra/Dockerfile
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.1.4' scipy coverage matplotlib lxml
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.2.0' scipy coverage matplotlib lxml


ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.1.4 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.2.0 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.59.3 grpcio-status==1.59.3 protobuf==4.25.1 googleapis-common-protos==1.56.4"

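As an aside on the `pandas<=2.2.0` pins above, here is a minimal sketch of how such a specifier is evaluated, using the `packaging` library for illustration (an assumption; pip applies the same PEP 440 semantics):

```python
# Illustrative only: check candidate pandas versions against the new pin.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("<=2.2.0")
print(Version("2.1.4") in spec)  # True  (old pin still satisfied)
print(Version("2.2.0") in spec)  # True  (newly allowed by this PR)
print(Version("2.2.1") in spec)  # False (still excluded)
```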
6 changes: 4 additions & 2 deletions python/pyspark/pandas/frame.py
@@ -10607,8 +10607,10 @@ def melt(
name_like_string(name) if name is not None else "variable_{}".format(i)
for i, name in enumerate(self._internal.column_label_names)
]
elif isinstance(var_name, str):
var_name = [var_name]
elif is_list_like(var_name):
raise ValueError(f"{var_name=} must be a scalar.")
else:
var_name = [var_name] # type: ignore[list-item]

pairs = F.explode(
F.array(
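To make the behavior change concrete, a hedged usage sketch (the frame and column names are made up): a scalar `var_name` is still accepted and normalized to a list, while a list-like `var_name` now raises, matching the updated melt test later in this diff.

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"A": [1, 2], "B": [3, 4]})
psdf.melt(id_vars="A", var_name="my_var")        # OK: scalar is wrapped as ["my_var"]
# psdf.melt(id_vars="A", var_name=["v1", "v2"])  # raises ValueError: must be a scalar
```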
3 changes: 1 addition & 2 deletions python/pyspark/pandas/namespace.py
@@ -85,7 +85,6 @@
from pyspark.pandas.frame import DataFrame, _reduce_spark_multi
from pyspark.pandas.internal import (
InternalFrame,
DEFAULT_SERIES_NAME,
HIDDEN_COLUMNS,
SPARK_INDEX_NAME_FORMAT,
)
@@ -2554,7 +2553,7 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
if isinstance(obj, Series):
num_series += 1
series_names.add(obj.name)
new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
new_objs.append(obj.to_frame())
else:
assert isinstance(obj, DataFrame)
new_objs.append(obj)
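Why dropping `DEFAULT_SERIES_NAME` works — a hedged plain-pandas illustration (not the pyspark code path itself): modern pandas assigns positional integer column labels when concatenating unnamed Series, so `to_frame()` no longer needs an explicit placeholder name.

```python
import pandas as pd

s1, s2 = pd.Series([1, 2]), pd.Series([3, 4])
result = pd.concat([s1, s2], axis=1)
print(result.columns.tolist())  # [0, 1] — positional labels, no placeholder needed
```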
50 changes: 35 additions & 15 deletions python/pyspark/pandas/plot/matplotlib.py
@@ -20,6 +20,7 @@
import matplotlib as mat
import numpy as np
from matplotlib.axes._base import _process_plot_format # type: ignore[attr-defined]
from matplotlib.figure import Figure
from pandas.core.dtypes.inference import is_list_like
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib import ( # type: ignore[attr-defined]
@@ -277,7 +278,7 @@ def _compute_plot_data(self):

self.data = {labels[0]: stats}

def _make_plot(self):
def _make_plot(self, fig: Figure):
bxpstats = list(self.data.values())[0]
ax = self._get_ax(0)
kwds = self.kwds.copy()
@@ -363,10 +364,23 @@ def _args_adjust(self):
if is_list_like(self.bottom):
self.bottom = np.array(self.bottom)

def _ensure_frame(self, data):
return data

def _calculate_bins(self, data, bins):
return bins
Comment on lines +413 to +414 — itholic (Contributor Author):

Pandas recently pushed a couple of commits refactoring its internal plotting structure, such as pandas-dev/pandas#55850 and pandas-dev/pandas#55872, so we also need to override a couple of internal methods to follow the latest Pandas behavior.


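A minimal, self-contained sketch of the pattern this refactor imposes (class names here are stand-ins, not pandas internals): pandas 2.2's plotting backend now threads the `Figure` into `_make_plot`, so every override below gains a `fig` parameter and forwards it when delegating.

```python
import matplotlib.pyplot as plt
from matplotlib.figure import Figure

class BasePlot:  # stand-in for the pandas MPLPlot base class
    def _make_plot(self, fig: Figure):
        fig.suptitle("drawn by BasePlot")

class SparkPlot(BasePlot):  # stand-in for the PandasOnSpark* subclasses
    def _make_plot(self, fig: Figure):
        # subclass hook runs first, then delegates upward with the same Figure
        super()._make_plot(fig)

SparkPlot()._make_plot(plt.figure())
```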
def _compute_plot_data(self):
self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins)

def _make_plot(self):
def _make_plot_keywords(self, kwds, y):
"""merge BoxPlot/KdePlot properties to passed kwds"""
# y is required for KdePlot
kwds["bottom"] = self.bottom
kwds["bins"] = self.bins
return kwds

def _make_plot(self, fig: Figure):
# TODO: this logic is similar to KdePlot. Might have to deduplicate it.
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
@@ -423,9 +437,9 @@ class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
@@ -434,9 +448,9 @@ class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
@@ -445,9 +459,9 @@ class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
@@ -456,9 +470,9 @@ class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
@@ -467,9 +481,9 @@ class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
def __init__(self, data, x, y, **kwargs):
super().__init__(self.get_top_n(data), x, y, **kwargs)

def _make_plot(self):
def _make_plot(self, fig: Figure):
self.set_result_text(self._get_ax(0))
super()._make_plot()
super()._make_plot(fig)


class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
@@ -478,7 +492,12 @@ class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
def _compute_plot_data(self):
self.data = KdePlotBase.prepare_kde_data(self.data)

def _make_plot(self):
def _make_plot_keywords(self, kwds, y):
kwds["bw_method"] = self.bw_method
kwds["ind"] = type(self)._get_ind(y, ind=self.ind)
return kwds

def _make_plot(self, fig: Figure):
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
colors = self._get_colors(num_colors=1)
@@ -515,8 +534,9 @@ def _make_plot(self):
self, "_append_legend_handles_labels"
) else self._add_legend_handle(artists[0], label, index=i)

def _get_ind(self, y):
return KdePlotBase.get_ind(y, self.ind)
@staticmethod
def _get_ind(y, ind):
return KdePlotBase.get_ind(y, ind)

@classmethod
def _plot(
13 changes: 7 additions & 6 deletions python/pyspark/pandas/resample.py
@@ -91,7 +91,8 @@ def __init__(
self._resamplekey = resamplekey

self._offset = to_offset(rule)
if self._offset.rule_code not in ["A-DEC", "M", "D", "H", "T", "S"]:

if self._offset.rule_code not in ["A-DEC", "ME", "D", "h", "min", "s"]:
raise ValueError("rule code {} is not supported".format(self._offset.rule_code))
if not getattr(self._offset, "n") > 0:
raise ValueError("rule offset must be positive")
@@ -184,7 +185,7 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
)
)

elif rule_code == "M":
elif rule_code == "ME":
assert (
origin.is_month_end
and origin.hour == 0
@@ -264,8 +265,8 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:

ret = F.when(edge_cond, edge_label).otherwise(non_edge_label)

elif rule_code in ["H", "T", "S"]:
unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
elif rule_code in ["h", "min", "s"]:
unit_mapping = {"h": "HOUR", "min": "MINUTE", "s": "SECOND"}
unit_str = unit_mapping[rule_code]

truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
@@ -274,10 +275,10 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
diff = timestampdiff(unit_str, origin_scol, truncated_ts_scol)
mod = F.lit(0) if n == 1 else (diff % F.lit(n))

if rule_code == "H":
if rule_code == "h":
assert origin.minute == 0 and origin.second == 0
edge_cond = (mod == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0)
elif rule_code == "T":
elif rule_code == "min":
assert origin.second == 0
edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
else:
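The alias renames driving these changes ('M' → 'ME', 'H' → 'h', 'T' → 'min', 'S' → 's' in pandas 2.2) suggest a small compatibility shim; a hedged sketch (names are hypothetical, inspired by the "Make resample work in old pandas as well" commit rather than copied from it):

```python
# Hypothetical helper: normalize pre-2.2 offset aliases to their 2.2 spellings.
OLD_TO_NEW_RULE_CODE = {"M": "ME", "H": "h", "T": "min", "S": "s"}

def canonical_rule_code(code: str) -> str:
    return OLD_TO_NEW_RULE_CODE.get(code, code)

assert canonical_rule_code("T") == "min"
assert canonical_rule_code("ME") == "ME"  # already-canonical codes pass through
```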
8 changes: 4 additions & 4 deletions python/pyspark/pandas/series.py
@@ -7092,15 +7092,15 @@ def resample(
----------
rule : str
The offset string or object representing target conversion.
Currently, supported units are {'Y', 'A', 'M', 'D', 'H',
'T', 'MIN', 'S'}.
Currently, supported units are {'YE', 'A', 'ME', 'D', 'h',
'min', 'MIN', 's'}.
closed : {{'right', 'left'}}, default None
Which side of bin interval is closed. The default is 'left'
for all frequency offsets except for 'A', 'Y' and 'M' which all
for all frequency offsets except for 'A', 'YE' and 'ME' which all
have a default of 'right'.
label : {{'right', 'left'}}, default None
Which bin edge label to label bucket with. The default is 'left'
for all frequency offsets except for 'A', 'Y' and 'M' which all
for all frequency offsets except for 'A', 'YE' and 'ME' which all
have a default of 'right'.
on : Series, optional
For a DataFrame, column to use instead of index for resampling.
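A hedged usage sketch with the 2.2-style aliases the docstring now lists (the data is made up):

```python
import pandas as pd
import pyspark.pandas as ps

pser = pd.Series(range(6), index=pd.date_range("2024-01-01", periods=6, freq="min"))
psser = ps.from_pandas(pser)
psser.resample("3min").sum()  # 'min' replaces the pre-2.2 'T' alias
```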
2 changes: 1 addition & 1 deletion python/pyspark/pandas/supported_api_gen.py
@@ -37,7 +37,7 @@
MAX_MISSING_PARAMS_SIZE = 5
COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
PANDAS_LATEST_VERSION = "2.1.4"
PANDAS_LATEST_VERSION = "2.2.0"

RST_HEADER = """
=====================
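For context, a hedged sketch of how a constant like `PANDAS_LATEST_VERSION` is typically consumed (this is an assumption about usage, not code from this file): warn when the installed pandas diverges from the version the supported-API docs were generated against.

```python
import warnings
import pandas as pd
from packaging.version import Version  # assumption: the real module may compare versions differently

PANDAS_LATEST_VERSION = "2.2.0"

if Version(pd.__version__) != Version(PANDAS_LATEST_VERSION):
    warnings.warn(
        f"Docs were generated against pandas {PANDAS_LATEST_VERSION}, found {pd.__version__}."
    )
```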
13 changes: 3 additions & 10 deletions python/pyspark/pandas/tests/computation/test_melt.py
@@ -100,23 +100,16 @@ def test_melt(self):
.sort_values(["variable_0", "variable_1", "value"])
.rename(columns=name_like_string),
)
self.assert_eq(
psdf.melt(
self.assertRaises(
ValueError,
lambda: psdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.reset_index(drop=True),
pdf.melt(
id_vars=[(TEN, "A")],
value_vars=[(TEN, "B")],
var_name=["myV1", "myV2"],
value_name="myValname",
)
.sort_values(["myV1", "myV2", "myValname"])
.rename(columns=name_like_string),
)

columns.names = ["v0", "v1"]
8 changes: 4 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -54,7 +54,7 @@ def test_add(self):

for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(b_pser + pser, b_psser + psser)
self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)
for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
if col == "bool":
@@ -73,7 +73,7 @@ def test_sub(self):
self.assertRaises(TypeError, lambda: b_psser - True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser - pdf[col], b_psser - psdf[col])
self.assert_eq(b_pser - pdf[col], b_psser - psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -90,7 +90,7 @@ def test_mul(self):
self.assert_eq(b_pser * False, b_psser * False)

for col in self.numeric_df_cols:
self.assert_eq(b_pser * pdf[col], b_psser * psdf[col])
self.assert_eq(b_pser * pdf[col], b_psser * psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
@@ -145,7 +145,7 @@ def test_mod(self):
self.assertRaises(TypeError, lambda: b_psser % True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser % pdf[col], b_psser % psdf[col])
self.assert_eq(b_pser % pdf[col], b_psser % psdf[col], check_exact=False)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser % psdf[col])
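What `check_exact=False` buys in these tests — a hedged standalone illustration using pandas' own testing helper (not the pyspark test harness): float comparisons fall back to rtol/atol tolerance, absorbing the small dtype/precision differences pandas 2.2 introduces.

```python
import pandas as pd
from pandas.testing import assert_series_equal

left, right = pd.Series([0.1 + 0.2]), pd.Series([0.3])
assert_series_equal(left, right, check_exact=False)   # passes within tolerance
# assert_series_equal(left, right, check_exact=True)  # would raise AssertionError
```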
12 changes: 8 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
@@ -105,14 +105,18 @@ def complex_psdf(self):
def test_add(self):
pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col])
self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col], check_exact=False)

# Numeric array + Numeric array
for col in self.numeric_array_df_cols:
pser1, psser1 = pdf[col], psdf[col]
for other_col in self.numeric_array_df_cols:
pser2, psser2 = pdf[other_col], psdf[other_col]
self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values())
self.assert_eq(
(pser1 + pser2).sort_values(),
(psser1 + psser2).sort_values(),
check_exact=False,
)

# Non-numeric array + Non-numeric array
self.assertRaises(
@@ -130,7 +134,7 @@ def test_add(self):

for col in self.non_numeric_array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser)
self.assert_eq(pser + pser, psser + psser, check_exact=False)

# Numeric array + Non-numeric array
for numeric_col in self.numeric_array_df_cols:
@@ -240,7 +244,7 @@ def test_from_to_pandas(self):
pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser, psser._to_pandas())
self.assert_eq(pser, psser._to_pandas(), check_exact=False)
self.assert_eq(ps.from_pandas(pser), psser)

def test_isnull(self):
24 changes: 12 additions & 12 deletions python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,33 +44,33 @@ def test_add(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser)
self.assert_eq(pser + 1, psser + 1)
self.assert_eq(pser + pser, psser + psser, check_exact=False)
self.assert_eq(pser + 1, psser + 1, check_exact=False)
# self.assert_eq(pser + 0.1, psser + 0.1)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool))
self.assert_eq(pser + True, psser + True)
self.assert_eq(pser + False, psser + False)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool), check_exact=False)
self.assert_eq(pser + True, psser + True, check_exact=False)
self.assert_eq(pser + False, psser + False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col])
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser + psdf[n_col])

def test_sub(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser - pser, psser - psser)
self.assert_eq(pser - 1, psser - 1)
self.assert_eq(pser - pser, psser - psser, check_exact=False)
self.assert_eq(pser - 1, psser - 1, check_exact=False)
# self.assert_eq(pser - 0.1, psser - 0.1)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool))
self.assert_eq(pser - True, psser - True)
self.assert_eq(pser - False, psser - False)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool), check_exact=False)
self.assert_eq(pser - True, psser - True, check_exact=False)
self.assert_eq(pser - False, psser - False, check_exact=False)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col])
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], check_exact=False)
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])
