diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index fa663bc6e419a..eaeed51f90cdf 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \
     ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
     ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
-RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.1.4' scipy coverage matplotlib lxml
+RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.2.0' scipy coverage matplotlib lxml

-ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.1.4 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas<=2.2.0 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
 # Python deps for Spark Connect
 ARG CONNECT_PIP_PKGS="grpcio==1.59.3 grpcio-status==1.59.3 protobuf==4.25.1 googleapis-common-protos==1.56.4"
diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 9ef04814ef825..1ca5d7aad5d16 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -69,6 +69,7 @@ Upgrading from PySpark 3.5 to 4.0
 * In Spark 4.0, ``Series.dt.week`` and ``Series.dt.weekofyear`` have been removed from Pandas API on Spark, use ``Series.dt.isocalendar().week`` instead.
 * In Spark 4.0, when applying ``astype`` to a decimal type object, the existing missing value is changed to ``True`` instead of ``False`` from Pandas API on Spark.
 * In Spark 4.0, ``pyspark.testing.assertPandasOnSparkEqual`` has been removed from Pandas API on Spark, use ``pyspark.pandas.testing.assert_frame_equal`` instead.
+* In Spark 4.0, the aliases ``Y``, ``M``, ``H``, ``T``, ``S`` have been deprecated from Pandas API on Spark, use ``YE``, ``ME``, ``h``, ``min``, ``s`` instead, respectively.
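Note on the migration-guide entry above: a minimal sketch of the alias migration it describes, assuming a running Spark session and a datetime-indexed series (the data and names here are illustrative, not part of the patch):

    import pandas as pd
    import pyspark.pandas as ps

    pser = pd.Series(range(6), index=pd.date_range("2024-01-01", periods=6, freq="D"))
    psser = ps.from_pandas(pser)
    psser.resample("2ME").sum()   # month-end; replaces the deprecated "2M"
    psser.resample("12h").sum()   # hourly; replaces the deprecated "12H"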
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index ddc26a67802ec..28ffa20d65c45 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -10609,8 +10609,10 @@ def melt(
                 name_like_string(name) if name is not None else "variable_{}".format(i)
                 for i, name in enumerate(self._internal.column_label_names)
             ]
-        elif isinstance(var_name, str):
-            var_name = [var_name]
+        elif is_list_like(var_name):
+            raise ValueError(f"{var_name=} must be a scalar.")
+        else:
+            var_name = [var_name]  # type: ignore[list-item]

         pairs = F.explode(
             F.array(
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index f6641b558f0aa..42a0ce49faa56 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -2554,7 +2554,10 @@ def resolve_func(psdf, this_column_labels, that_column_labels):
         if isinstance(obj, Series):
             num_series += 1
             series_names.add(obj.name)
-            new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
+            if not ignore_index and not should_return_series:
+                new_objs.append(obj.to_frame())
+            else:
+                new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME))
         else:
             assert isinstance(obj, DataFrame)
             new_objs.append(obj)
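A minimal sketch of the ``melt`` behavior change above, aligning with newer pandas validation (illustrative names; ``var_name`` must now be a scalar):

    import pyspark.pandas as ps

    psdf = ps.DataFrame({"A": [1, 2], "B": [3, 4]})
    psdf.melt(id_vars="A", var_name="variable")     # scalar var_name: OK
    psdf.melt(id_vars="A", var_name=["v1", "v2"])   # list-like: now raises
    # ValueError: var_name=['v1', 'v2'] must be a scalar.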
+ """ + ticks = ax.get_xticks() if is_vertical else ax.get_yticks() + if len(ticks) != len(labels): + i, remainder = divmod(len(ticks), len(labels)) + assert remainder == 0, remainder + labels *= i + if is_vertical: + ax.set_xticklabels(labels, **kwargs) + else: + ax.set_yticklabels(labels, **kwargs) + + class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase): _kind = "bar" @@ -231,10 +253,23 @@ def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds): else: return ax, bp + @final + def _ensure_frame(self, data): + if isinstance(data, Series): + label = self.label + if label is None and data.name is None: + label = "" + if label is None: + data = data.to_frame() + else: + data = data.to_frame(name=label) + return data + def _compute_plot_data(self): - colname = self.data.name - spark_column_name = self.data._internal.spark_column_name_for(self.data._column_label) data = self.data + data = first_series(data) if not isinstance(data, Series) else data + colname = data.name + spark_column_name = data._internal.spark_column_name_for(data._column_label) # Updates all props with the rc defaults from matplotlib self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds)) @@ -277,7 +312,7 @@ def _compute_plot_data(self): self.data = {labels[0]: stats} - def _make_plot(self): + def _make_plot(self, fig: Figure): bxpstats = list(self.data.values())[0] ax = self._get_ax(0) kwds = self.kwds.copy() @@ -303,7 +338,7 @@ def _make_plot(self): labels = [pprint_thing(lbl) for lbl in labels] if not self.use_index: labels = [pprint_thing(key) for key in range(len(labels))] - self._set_ticklabels(ax, labels) + _set_ticklabels(ax, labels, self.orientation == "vertical") @staticmethod def rc_defaults( @@ -363,10 +398,32 @@ def _args_adjust(self): if is_list_like(self.bottom): self.bottom = np.array(self.bottom) + @final + def _ensure_frame(self, data): + if isinstance(data, Series): + label = self.label + if label is None and data.name is None: + label = "" + if label is None: + data = data.to_frame() + else: + data = data.to_frame(name=label) + return data + + def _calculate_bins(self, data, bins): + return bins + def _compute_plot_data(self): self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins) - def _make_plot(self): + def _make_plot_keywords(self, kwds, y): + """merge BoxPlot/KdePlot properties to passed kwds""" + # y is required for KdePlot + kwds["bottom"] = self.bottom + kwds["bins"] = self.bins + return kwds + + def _make_plot(self, fig: Figure): # TODO: this logic is similar to KdePlot. Might have to deduplicate it. # 'num_colors' requires to calculate `shape` which has to count all. # Use 1 for now to save the computation. 
@@ -363,10 +398,32 @@ def _args_adjust(self):
         if is_list_like(self.bottom):
             self.bottom = np.array(self.bottom)

+    @final
+    def _ensure_frame(self, data):
+        if isinstance(data, Series):
+            label = self.label
+            if label is None and data.name is None:
+                label = ""
+            if label is None:
+                data = data.to_frame()
+            else:
+                data = data.to_frame(name=label)
+        return data
+
+    def _calculate_bins(self, data, bins):
+        return bins
+
     def _compute_plot_data(self):
         self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins)

-    def _make_plot(self):
+    def _make_plot_keywords(self, kwds, y):
+        """merge BoxPlot/KdePlot properties to passed kwds"""
+        # y is required for KdePlot
+        kwds["bottom"] = self.bottom
+        kwds["bins"] = self.bins
+        return kwds
+
+    def _make_plot(self, fig: Figure):
         # TODO: this logic is similar to KdePlot. Might have to deduplicate it.
         # 'num_colors' requires to calculate `shape` which has to count all.
         # Use 1 for now to save the computation.
@@ -423,9 +480,9 @@ class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
     def __init__(self, data, **kwargs):
         super().__init__(self.get_top_n(data), **kwargs)

-    def _make_plot(self):
+    def _make_plot(self, fig: Figure):
         self.set_result_text(self._get_ax(0))
-        super()._make_plot()
+        super()._make_plot(fig)


 class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
@@ -434,9 +491,9 @@ class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
     def __init__(self, data, **kwargs):
         super().__init__(self.get_sampled(data), **kwargs)

-    def _make_plot(self):
+    def _make_plot(self, fig: Figure):
         self.set_result_text(self._get_ax(0))
-        super()._make_plot()
+        super()._make_plot(fig)


 class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
@@ -445,9 +502,9 @@ class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
     def __init__(self, data, **kwargs):
         super().__init__(self.get_sampled(data), **kwargs)

-    def _make_plot(self):
+    def _make_plot(self, fig: Figure):
         self.set_result_text(self._get_ax(0))
-        super()._make_plot()
+        super()._make_plot(fig)


 class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
@@ -456,9 +513,9 @@ class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
     def __init__(self, data, **kwargs):
         super().__init__(self.get_top_n(data), **kwargs)

-    def _make_plot(self):
+    def _make_plot(self, fig: Figure):
         self.set_result_text(self._get_ax(0))
-        super()._make_plot()
+        super()._make_plot(fig)


 class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
@@ -467,9 +524,9 @@ class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
     def __init__(self, data, x, y, **kwargs):
         super().__init__(self.get_top_n(data), x, y, **kwargs)

-    def _make_plot(self):
+    def _make_plot(self, fig: Figure):
         self.set_result_text(self._get_ax(0))
-        super()._make_plot()
+        super()._make_plot(fig)


 class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
@@ -478,7 +535,12 @@ class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
     def _compute_plot_data(self):
         self.data = KdePlotBase.prepare_kde_data(self.data)

-    def _make_plot(self):
+    def _make_plot_keywords(self, kwds, y):
+        kwds["bw_method"] = self.bw_method
+        kwds["ind"] = type(self)._get_ind(y, ind=self.ind)
+        return kwds
+
+    def _make_plot(self, fig: Figure):
         # 'num_colors' requires to calculate `shape` which has to count all.
         # Use 1 for now to save the computation.
         colors = self._get_colors(num_colors=1)
@@ -515,8 +577,9 @@ def _make_plot(self):
                 self, "_append_legend_handles_labels"
             ) else self._add_legend_handle(artists[0], label, index=i)

-    def _get_ind(self, y):
-        return KdePlotBase.get_ind(y, self.ind)
+    @staticmethod
+    def _get_ind(y, ind):
+        return KdePlotBase.get_ind(y, ind)

     @classmethod
     def _plot(
diff --git a/python/pyspark/pandas/resample.py b/python/pyspark/pandas/resample.py
index fdcfa3243c00f..9683fc4f4e7ff 100644
--- a/python/pyspark/pandas/resample.py
+++ b/python/pyspark/pandas/resample.py
@@ -91,20 +91,21 @@ def __init__(
         self._resamplekey = resamplekey

         self._offset = to_offset(rule)
-        if self._offset.rule_code not in ["A-DEC", "M", "D", "H", "T", "S"]:
+
+        if self._offset.rule_code not in ["A-DEC", "M", "ME", "D", "H", "h", "T", "min", "S", "s"]:
             raise ValueError("rule code {} is not supported".format(self._offset.rule_code))
         if not getattr(self._offset, "n") > 0:
             raise ValueError("rule offset must be positive")

         if closed is None:
-            self._closed = "right" if self._offset.rule_code in ["A-DEC", "M"] else "left"
+            self._closed = "right" if self._offset.rule_code in ["A-DEC", "M", "ME"] else "left"
         elif closed in ["left", "right"]:
             self._closed = closed
         else:
             raise ValueError("invalid closed: '{}'".format(closed))

         if label is None:
-            self._label = "right" if self._offset.rule_code in ["A-DEC", "M"] else "left"
+            self._label = "right" if self._offset.rule_code in ["A-DEC", "M", "ME"] else "left"
         elif label in ["left", "right"]:
             self._label = label
         else:
@@ -184,7 +185,7 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
                 )
             )

-        elif rule_code == "M":
+        elif rule_code in ["ME", "M"]:
             assert (
                 origin.is_month_end
                 and origin.hour == 0
@@ -264,8 +265,15 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:

             ret = F.when(edge_cond, edge_label).otherwise(non_edge_label)

-        elif rule_code in ["H", "T", "S"]:
-            unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
+        elif rule_code in ["h", "min", "s", "H", "T", "S"]:
+            unit_mapping = {
+                "h": "HOUR",
+                "min": "MINUTE",
+                "s": "SECOND",
+                "H": "HOUR",
+                "T": "MINUTE",
+                "S": "SECOND",
+            }
             unit_str = unit_mapping[rule_code]

             truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
@@ -274,10 +282,10 @@ def _bin_timestamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
             diff = timestampdiff(unit_str, origin_scol, truncated_ts_scol)
             mod = F.lit(0) if n == 1 else (diff % F.lit(n))

-            if rule_code == "H":
+            if rule_code in ["h", "H"]:
                 assert origin.minute == 0 and origin.second == 0
                 edge_cond = (mod == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0)
-            elif rule_code == "T":
+            elif rule_code in ["min", "T"]:
                 assert origin.second == 0
                 edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
             else:
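The widened allow-list above accepts both the pre-2.2 and the 2.2 offset codes. A sketch of how pandas resolves rule strings to these codes (this is pandas' own behavior; the exact codes shown assume pandas >= 2.2):

    from pandas.tseries.frequencies import to_offset

    to_offset("3ME").rule_code    # 'ME'  (month end)
    to_offset("2h").rule_code     # 'h'
    to_offset("15min").rule_code  # 'min'
    to_offset("10s").rule_code    # 's'
    # Older pandas still yields 'M', 'H', 'T', 'S', hence both spellings stay supported.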
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index a0e4ecc40d5e5..98818a368a9f9 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -7092,15 +7092,15 @@ def resample(
         ----------
         rule : str
             The offset string or object representing target conversion.
-            Currently, supported units are {'Y', 'A', 'M', 'D', 'H',
-            'T', 'MIN', 'S'}.
+            Currently, supported units are {'YE', 'A', 'ME', 'D', 'h',
+            'min', 'MIN', 's'}.
         closed : {{'right', 'left'}}, default None
             Which side of bin interval is closed. The default is 'left'
-            for all frequency offsets except for 'A', 'Y' and 'M' which all
+            for all frequency offsets except for 'A', 'YE' and 'ME' which all
             have a default of 'right'.
         label : {{'right', 'left'}}, default None
             Which bin edge label to label bucket with. The default is 'left'
-            for all frequency offsets except for 'A', 'Y' and 'M' which all
+            for all frequency offsets except for 'A', 'YE' and 'ME' which all
             have a default of 'right'.
         on : Series, optional
             For a DataFrame, column to use instead of index for resampling.
diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py
index 102405f8376f2..7712717881eac 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -38,7 +38,7 @@
 MAX_MISSING_PARAMS_SIZE = 5
 COMMON_PARAMETER_SET = {"kwargs", "args", "cls"}
 MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)]
-PANDAS_LATEST_VERSION = "2.1.4"
+PANDAS_LATEST_VERSION = "2.2.0"

 RST_HEADER = """
 =====================
diff --git a/python/pyspark/pandas/tests/computation/test_melt.py b/python/pyspark/pandas/tests/computation/test_melt.py
index 982ab76045834..844c4bd4e9b89 100644
--- a/python/pyspark/pandas/tests/computation/test_melt.py
+++ b/python/pyspark/pandas/tests/computation/test_melt.py
@@ -100,8 +100,9 @@ def test_melt(self):
             .sort_values(["variable_0", "variable_1", "value"])
             .rename(columns=name_like_string),
         )
-        self.assert_eq(
-            psdf.melt(
+        self.assertRaises(
+            ValueError,
+            lambda: psdf.melt(
                 id_vars=[(TEN, "A")],
                 value_vars=[(TEN, "B")],
                 var_name=["myV1", "myV2"],
@@ -109,14 +110,6 @@ def test_melt(self):
             )
             .sort_values(["myV1", "myV2", "myValname"])
             .reset_index(drop=True),
-            pdf.melt(
-                id_vars=[(TEN, "A")],
-                value_vars=[(TEN, "B")],
-                var_name=["myV1", "myV2"],
-                value_name="myValname",
-            )
-            .sort_values(["myV1", "myV2", "myValname"])
-            .rename(columns=name_like_string),
         )

         columns.names = ["v0", "v1"]
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index bb8067530d643..f9ec58b279a2d 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -54,7 +54,7 @@ def test_add(self):

         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(b_pser + pser, b_psser + psser)
+            self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)

         for col in self.non_numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
             if col == "bool":
@@ -73,7 +73,7 @@ def test_sub(self):
         self.assertRaises(TypeError, lambda: b_psser - True)

         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser - pdf[col], b_psser - psdf[col])
+            self.assert_eq(b_pser - pdf[col], b_psser - psdf[col], check_exact=False)

         for col in self.non_numeric_df_cols:
             self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -90,7 +90,7 @@ def test_mul(self):
         self.assert_eq(b_pser * False, b_psser * False)

         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser * pdf[col], b_psser * psdf[col])
+            self.assert_eq(b_pser * pdf[col], b_psser * psdf[col], check_exact=False)

         for col in self.non_numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
@@ -145,7 +145,7 @@ def test_mod(self):
         self.assertRaises(TypeError, lambda: b_psser % True)

         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser % pdf[col], b_psser % psdf[col])
+            self.assert_eq(b_pser % pdf[col], b_psser % psdf[col], check_exact=False)

         for col in self.non_numeric_df_cols:
             self.assertRaises(TypeError, lambda: b_psser % psdf[col])
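The ``check_exact=False`` relaxations in this and the following test files all follow one pattern: pandas 2.2 can produce slightly different floating-point results or dtypes than the Spark-computed side, so comparisons are loosened to a tolerance. A minimal sketch with plain pandas testing utilities, for illustration:

    import pandas as pd
    from pandas.testing import assert_series_equal

    left = pd.Series([1.0, 2.0, 3.0])
    right = pd.Series([1.0, 2.0, 3.0 + 1e-12])
    assert_series_equal(left, right, check_exact=False)  # passes: compared within rtol
    # with check_exact=True the same comparison raises AssertionError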
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
index 535fda1359b82..982050c404017 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
@@ -105,14 +105,18 @@ def complex_psdf(self):
     def test_add(self):
         pdf, psdf = self.array_pdf, self.array_psdf
         for col in self.array_df_cols:
-            self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col])
+            self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col], check_exact=False)

         # Numeric array + Numeric array
         for col in self.numeric_array_df_cols:
             pser1, psser1 = pdf[col], psdf[col]
             for other_col in self.numeric_array_df_cols:
                 pser2, psser2 = pdf[other_col], psdf[other_col]
-                self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values())
+                self.assert_eq(
+                    (pser1 + pser2).sort_values(),
+                    (psser1 + psser2).sort_values(),
+                    check_exact=False,
+                )

         # Non-numeric array + Non-numeric array
         self.assertRaises(
@@ -130,7 +134,7 @@ def test_add(self):

         for col in self.non_numeric_array_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser + pser, psser + psser)
+            self.assert_eq(pser + pser, psser + psser, check_exact=False)

         # Numeric array + Non-numeric array
         for numeric_col in self.numeric_array_df_cols:
@@ -240,7 +244,7 @@ def test_from_to_pandas(self):
         pdf, psdf = self.array_pdf, self.array_psdf
         for col in self.array_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser, psser._to_pandas())
+            self.assert_eq(pser, psser._to_pandas(), check_exact=False)
             self.assert_eq(ps.from_pandas(pser), psser)

     def test_isnull(self):
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
index f79691646ec12..a982c5e148b4f 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,16 +44,16 @@ def test_add(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser + pser, psser + psser)
-            self.assert_eq(pser + 1, psser + 1)
+            self.assert_eq(pser + pser, psser + psser, check_exact=False)
+            self.assert_eq(pser + 1, psser + 1, check_exact=False)
             # self.assert_eq(pser + 0.1, psser + 0.1)
-            self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool))
-            self.assert_eq(pser + True, psser + True)
-            self.assert_eq(pser + False, psser + False)
+            self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool), check_exact=False)
+            self.assert_eq(pser + True, psser + True, check_exact=False)
+            self.assert_eq(pser + False, psser + False, check_exact=False)

         for n_col in self.non_numeric_df_cols:
             if n_col == "bool":
-                self.assert_eq(pser + pdf[n_col], psser + psdf[n_col])
+                self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], check_exact=False)
             else:
                 self.assertRaises(TypeError, lambda: psser + psdf[n_col])

@@ -61,16 +61,16 @@ def test_sub(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser - pser, psser - psser)
-            self.assert_eq(pser - 1, psser - 1)
+            self.assert_eq(pser - pser, psser - psser, check_exact=False)
+            self.assert_eq(pser - 1, psser - 1, check_exact=False)
             # self.assert_eq(pser - 0.1, psser - 0.1)
-            self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool))
-            self.assert_eq(pser - True, psser - True)
-            self.assert_eq(pser - False, psser - False)
+            self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool), check_exact=False)
+            self.assert_eq(pser - True, psser - True, check_exact=False)
+            self.assert_eq(pser - False, psser - False, check_exact=False)

         for n_col in self.non_numeric_df_cols:
             if n_col == "bool":
-                self.assert_eq(pser - pdf[n_col], psser - psdf[n_col])
+                self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], check_exact=False)
             else:
                 self.assertRaises(TypeError, lambda: psser - psdf[n_col])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
index 7937097af1572..9f4e5108810fd 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
@@ -38,9 +38,9 @@ def test_mod(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser % pser, psser % psser)
-            self.assert_eq(pser % pser.astype(bool), psser % psser.astype(bool))
-            self.assert_eq(pser % True, psser % True)
+            self.assert_eq(pser % pser, psser % psser, check_exact=False)
+            self.assert_eq(pser % pser.astype(bool), psser % psser.astype(bool), check_exact=False)
+            self.assert_eq(pser % True, psser % True, check_exact=False)
             if col in ["int", "int32"]:
                 self.assert_eq(
                     pd.Series([np.nan, np.nan, np.nan], dtype=float, name=col), psser % False
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_mul_div.py b/python/pyspark/pandas/tests/data_type_ops/test_num_mul_div.py
index 3096373fff20e..8a719fe9fd88c 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_mul_div.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_mul_div.py
@@ -38,17 +38,17 @@ def test_mul(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser * pser, psser * psser)
-            self.assert_eq(pser * pser.astype(bool), psser * psser.astype(bool))
-            self.assert_eq(pser * True, psser * True)
-            self.assert_eq(pser * False, psser * False)
+            self.assert_eq(pser * pser, psser * psser, check_exact=False)
+            self.assert_eq(pser * pser.astype(bool), psser * psser.astype(bool), check_exact=False)
+            self.assert_eq(pser * True, psser * True, check_exact=False)
+            self.assert_eq(pser * False, psser * False, check_exact=False)

             if psser.dtype in [int, np.int32]:
                 self.assert_eq(pser * pdf["string"], psser * psdf["string"])
             else:
                 self.assertRaises(TypeError, lambda: psser * psdf["string"])

-            self.assert_eq(pser * pdf["bool"], psser * psdf["bool"])
+            self.assert_eq(pser * pdf["bool"], psser * psdf["bool"], check_exact=False)

             self.assertRaises(TypeError, lambda: psser * psdf["datetime"])
             self.assertRaises(TypeError, lambda: psser * psdf["date"])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index e7b157cabb2f0..03a794771a910 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -101,7 +101,7 @@ def test_from_to_pandas(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser, psser._to_pandas())
+            self.assert_eq(pser, psser._to_pandas(), check_exact=False)
             self.assert_eq(ps.from_pandas(pser), psser)

     def test_isnull(self):
@@ -112,12 +112,12 @@ def test_isnull(self):
     def test_neg(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
-            self.assert_eq(-pdf[col], -psdf[col])
+            self.assert_eq(-pdf[col], -psdf[col], check_exact=False)

     def test_abs(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
-            self.assert_eq(abs(pdf[col]), abs(psdf[col]))
+            self.assert_eq(abs(pdf[col]), abs(psdf[col]), check_exact=False)

     def test_invert(self):
         pdf, psdf = self.pdf, self.psdf
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
index e60fa1e781f08..a14177348cd00 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
@@ -45,11 +45,11 @@ def test_radd(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 + pser, 1 + psser)
+            self.assert_eq(1 + pser, 1 + psser, check_exact=False)
             # self.assert_eq(0.1 + pser, 0.1 + psser)
             self.assertRaises(TypeError, lambda: "x" + psser)
-            self.assert_eq(True + pser, True + psser)
-            self.assert_eq(False + pser, False + psser)
+            self.assert_eq(True + pser, True + psser, check_exact=False)
+            self.assert_eq(False + pser, False + psser, check_exact=False)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + psser)

@@ -57,11 +57,11 @@ def test_rsub(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 - pser, 1 - psser)
+            self.assert_eq(1 - pser, 1 - psser, check_exact=False)
             # self.assert_eq(0.1 - pser, 0.1 - psser)
             self.assertRaises(TypeError, lambda: "x" - psser)
-            self.assert_eq(True - pser, True - psser)
-            self.assert_eq(False - pser, False - psser)
+            self.assert_eq(True - pser, True - psser, check_exact=False)
+            self.assert_eq(False - pser, False - psser, check_exact=False)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - psser)

@@ -69,11 +69,11 @@ def test_rmul(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 * pser, 1 * psser)
+            self.assert_eq(1 * pser, 1 * psser, check_exact=False)
             # self.assert_eq(0.1 * pser, 0.1 * psser)
             self.assertRaises(TypeError, lambda: "x" * psser)
-            self.assert_eq(True * pser, True * psser)
-            self.assert_eq(False * pser, False * psser)
+            self.assert_eq(True * pser, True * psser, check_exact=False)
+            self.assert_eq(False * pser, False * psser, check_exact=False)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) * psser)

@@ -117,10 +117,10 @@ def test_rmod(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 % pser, 1 % psser)
+            self.assert_eq(1 % pser, 1 % psser, check_exact=False)
             # self.assert_eq(0.1 % pser, 0.1 % psser)
-            self.assert_eq(True % pser, True % psser)
-            self.assert_eq(False % pser, False % psser)
+            self.assert_eq(True % pser, True % psser, check_exact=False)
+            self.assert_eq(False % pser, False % psser, check_exact=False)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) % psser)
diff --git a/python/pyspark/pandas/tests/frame/test_reshaping.py b/python/pyspark/pandas/tests/frame/test_reshaping.py
index b7c732f113d50..05195b485fb0a 100644
--- a/python/pyspark/pandas/tests/frame/test_reshaping.py
+++ b/python/pyspark/pandas/tests/frame/test_reshaping.py
@@ -327,7 +327,7 @@ def test_explode(self):
         )

         self.assert_eq(result1, expected_result1, almost=True)
-        self.assert_eq(result2, expected_result2)
+        self.assert_eq(result2, expected_result2, check_exact=False)
         self.assert_eq(result1.index.name, expected_result1.index.name)
         self.assert_eq(result1.columns.name, expected_result1.columns.name)
         self.assert_eq(result3, expected_result3, almost=True)
@@ -349,7 +349,7 @@ def test_explode(self):
         )

         self.assert_eq(result1, expected_result1, almost=True)
-        self.assert_eq(result2, expected_result2)
+        self.assert_eq(result2, expected_result2, check_exact=False)
         self.assert_eq(result1.index.names, expected_result1.index.names)
         self.assert_eq(result1.columns.name, expected_result1.columns.name)
         self.assert_eq(result3, expected_result3, almost=True)
@@ -367,7 +367,7 @@ def test_explode(self):
         expected_result3, result3 = pdf.A.explode("Z"), psdf.A.explode("Z")

         self.assert_eq(result1, expected_result1, almost=True)
-        self.assert_eq(result2, expected_result2)
+        self.assert_eq(result2, expected_result2, check_exact=False)
         self.assert_eq(result1.index.names, expected_result1.index.names)
         self.assert_eq(result1.columns.names, expected_result1.columns.names)
         self.assert_eq(result3, expected_result3, almost=True)
diff --git a/python/pyspark/pandas/tests/indexes/test_conversion.py b/python/pyspark/pandas/tests/indexes/test_conversion.py
index 5790fb66ecf9b..9759a3d06a759 100644
--- a/python/pyspark/pandas/tests/indexes/test_conversion.py
+++ b/python/pyspark/pandas/tests/indexes/test_conversion.py
@@ -195,8 +195,8 @@ def test_to_series(self):
         psidx = self.psdf.set_index("b", append=True).index

         with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
-            self.assert_eq(psidx.to_series(), pidx.to_series())
-            self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"))
+            self.assert_eq(psidx.to_series(), pidx.to_series(), check_exact=False)
+            self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a"), check_exact=False)

         expected_error_message = "Series.name must be a hashable type"
         with self.assertRaisesRegex(TypeError, expected_error_message):
diff --git a/python/pyspark/pandas/tests/resample/test_error.py b/python/pyspark/pandas/tests/resample/test_error.py
index 15b5df7b3b80c..018930db5b1bd 100644
--- a/python/pyspark/pandas/tests/resample/test_error.py
+++ b/python/pyspark/pandas/tests/resample/test_error.py
@@ -52,12 +52,21 @@ def test_resample_error(self):
             psdf.A.resample("3W").sum()

         with self.assertRaisesRegex(ValueError, "rule offset must be positive"):
+            psdf.A.resample("0D").sum()
+
+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
             psdf.A.resample("0Y").sum()

         with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
+            psdf.A.resample("3D", closed="middle").sum()
+
+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
             psdf.A.resample("3Y", closed="middle").sum()

         with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
+            psdf.A.resample("3D", label="both").sum()
+
+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
             psdf.A.resample("3Y", label="both").sum()

         with self.assertRaisesRegex(
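The updated expected messages in test_error.py reflect pandas 2.2 naming for yearly offsets; an illustrative check (this assumes pandas >= 2.2, where the year-end prefix becomes 'YE'):

    from pandas.tseries.frequencies import to_offset

    to_offset("3Y").rule_code  # 'YE-DEC' under pandas >= 2.2, so the resampler's
    # allow-list check now fails with "rule code YE-DEC is not supported"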
diff --git a/python/pyspark/pandas/tests/resample/test_frame.py b/python/pyspark/pandas/tests/resample/test_frame.py
index c3b3423a60a28..3a11fda4a6f59 100644
--- a/python/pyspark/pandas/tests/resample/test_frame.py
+++ b/python/pyspark/pandas/tests/resample/test_frame.py
@@ -124,16 +124,19 @@ def _test_resample(self, pobj, psobj, rules, closed, label, func):
                 getattr(p_resample, func)().sort_index(),
                 getattr(ps_resample, func)().sort_index(),
                 almost=True,
+                check_exact=False,
             )

     def test_dataframe_resample(self):
-        self._test_resample(self.pdf1, self.psdf1, ["3Y", "9M", "17D"], None, None, "min")
-        self._test_resample(self.pdf2, self.psdf2, ["3A", "11M", "D"], None, "left", "max")
-        self._test_resample(self.pdf3, self.psdf3, ["20D", "1M"], None, "right", "sum")
         self._test_resample(self.pdf4, self.psdf4, ["11H", "21D"], "left", None, "mean")
         self._test_resample(self.pdf5, self.psdf5, ["55MIN", "2H", "D"], "left", "left", "std")
         self._test_resample(self.pdf6, self.psdf6, ["29S", "10MIN", "3H"], "left", "right", "var")

+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
+            self._test_resample(self.pdf2, self.psdf2, ["3A", "11M", "D"], None, "left", "max")
+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
+            self._test_resample(self.pdf1, self.psdf1, ["3Y", "9M", "17D"], None, None, "min")
+

 class ResampleFrameTests(ResampleFrameMixin, PandasOnSparkTestCase, TestUtils):
     pass
diff --git a/python/pyspark/pandas/tests/resample/test_missing.py b/python/pyspark/pandas/tests/resample/test_missing.py
index 07dee20bad4f9..590df87cb4e52 100644
--- a/python/pyspark/pandas/tests/resample/test_missing.py
+++ b/python/pyspark/pandas/tests/resample/test_missing.py
@@ -63,8 +63,8 @@ def psdf1(self):
         return ps.from_pandas(self.pdf1)

     def test_missing(self):
-        pdf_r = self.psdf1.resample("3Y")
-        pser_r = self.psdf1.A.resample("3Y")
+        pdf_r = self.psdf1.resample("3D")
+        pser_r = self.psdf1.A.resample("3D")

         # DataFrameResampler functions
         missing_functions = inspect.getmembers(
diff --git a/python/pyspark/pandas/tests/resample/test_series.py b/python/pyspark/pandas/tests/resample/test_series.py
index 4828ca089e124..6b1c01c654967 100644
--- a/python/pyspark/pandas/tests/resample/test_series.py
+++ b/python/pyspark/pandas/tests/resample/test_series.py
@@ -127,13 +127,15 @@ def _test_resample(self, pobj, psobj, rules, closed, label, func):
             )

     def test_series_resample(self):
-        self._test_resample(self.pdf1.A, self.psdf1.A, ["4Y"], "right", None, "min")
         self._test_resample(self.pdf2.A, self.psdf2.A, ["13M"], "right", "left", "max")
         self._test_resample(self.pdf3.A, self.psdf3.A, ["1001H"], "right", "right", "sum")
         self._test_resample(self.pdf4.A, self.psdf4.A, ["6D"], None, None, "mean")
         self._test_resample(self.pdf5.A, self.psdf5.A, ["47T"], "left", "left", "var")
         self._test_resample(self.pdf6.A, self.psdf6.A, ["111S"], "right", "right", "std")

+        with self.assertRaisesRegex(ValueError, "rule code YE-DEC is not supported"):
+            self._test_resample(self.pdf1.A, self.psdf1.A, ["4Y"], "right", None, "min")
+

 class ResampleSeriesTests(ResampleSeriesMixin, PandasOnSparkTestCase, TestUtils):
     pass
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index a78fedd3ece44..7024ef2a977c4 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -434,11 +434,7 @@ def test_concat_index_axis(self):
             "MultiIndex columns should have the same levels",
             lambda: ps.concat([psdf, psdf3]),
         )
-        self.assertRaisesRegex(
-            ValueError,
-            "MultiIndex columns should have the same levels",
-            lambda: ps.concat([psdf3[("Y", "C")], psdf3]),
-        )
+        self.assert_eq(ps.concat([psdf3[("Y", "C")], psdf3]), pd.concat([pdf3[("Y", "C")], pdf3]))

         pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]})
         psdf4 = ps.from_pandas(pdf4)
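A minimal sketch of the ``concat`` behavior exercised by the rewritten assertion above, enabled by the namespace.py change earlier in this patch: concatenating a named Series with a MultiIndex-columned DataFrame no longer raises and instead matches pandas (illustrative data):

    import pandas as pd
    import pyspark.pandas as ps

    pdf = pd.DataFrame({("X", "A"): [0, 1], ("Y", "C"): [2, 3]})
    psdf = ps.from_pandas(pdf)
    # Previously: ValueError("MultiIndex columns should have the same levels")
    ps.concat([psdf[("Y", "C")], psdf])  # now equivalent to pd.concat([...])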
"C": [10, 20, 30]}) psdf4 = ps.from_pandas(pdf4) diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 1cefc1ddb3d8c..3f5484f48764d 100755 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -2244,6 +2244,7 @@ def test_stat_freq_items(self): self.assert_eq( self.connect.read.table(self.tbl_name2).stat.freqItems(["col1", "col3"]).toPandas(), self.spark.read.table(self.tbl_name2).stat.freqItems(["col1", "col3"]).toPandas(), + check_exact=False, ) self.assert_eq( diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index 9adae0f6f755f..cb0d1bab7ffa9 100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -604,10 +604,12 @@ def test_aggregation_functions(self): self.assert_eq( cdf.select(cfunc("b"), cfunc(cdf.c)).toPandas(), sdf.select(sfunc("b"), sfunc(sdf.c)).toPandas(), + check_exact=False, ) self.assert_eq( cdf.groupBy("a").agg(cfunc("b"), cfunc(cdf.c)).toPandas(), sdf.groupBy("a").agg(sfunc("b"), sfunc(sdf.c)).toPandas(), + check_exact=False, ) for cfunc, sfunc in [ @@ -646,14 +648,17 @@ def test_aggregation_functions(self): self.assert_eq( cdf.select(CF.percentile_approx(cdf.b, [0.1, 0.9])).toPandas(), sdf.select(SF.percentile_approx(sdf.b, [0.1, 0.9])).toPandas(), + check_exact=False, ) self.assert_eq( cdf.groupBy("a").agg(CF.percentile_approx("b", 0.5)).toPandas(), sdf.groupBy("a").agg(SF.percentile_approx("b", 0.5)).toPandas(), + check_exact=False, ) self.assert_eq( cdf.groupBy("a").agg(CF.percentile_approx(cdf.b, [0.1, 0.9])).toPandas(), sdf.groupBy("a").agg(SF.percentile_approx(sdf.b, [0.1, 0.9])).toPandas(), + check_exact=False, ) # test count_distinct @@ -1000,6 +1005,7 @@ def test_collection_functions(self): self.assert_eq( cdf.select(cfunc("a"), cfunc(cdf.b)).toPandas(), sdf.select(sfunc("a"), sfunc(sdf.b)).toPandas(), + check_exact=False, ) for cfunc, sfunc in [ @@ -1011,6 +1017,7 @@ def test_collection_functions(self): self.assert_eq( cdf.select(cfunc("b", cdf.c)).toPandas(), sdf.select(sfunc("b", sdf.c)).toPandas(), + check_exact=False, ) for cfunc, sfunc in [ @@ -1020,64 +1027,77 @@ def test_collection_functions(self): self.assert_eq( cdf.select(cfunc(cdf.a, "ab")).toPandas(), sdf.select(sfunc(sdf.a, "ab")).toPandas(), + check_exact=False, ) # test array self.assert_eq( cdf.select(CF.array(cdf.d, "e")).toPandas(), sdf.select(SF.array(sdf.d, "e")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.array(cdf.d, "e", CF.lit(99))).toPandas(), sdf.select(SF.array(sdf.d, "e", SF.lit(99))).toPandas(), + check_exact=False, ) # test array_contains self.assert_eq( cdf.select(CF.array_contains(cdf.a, "ab")).toPandas(), sdf.select(SF.array_contains(sdf.a, "ab")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.array_contains(cdf.a, cdf.f)).toPandas(), sdf.select(SF.array_contains(sdf.a, sdf.f)).toPandas(), + check_exact=False, ) # test array_append self.assert_eq( cdf.select(CF.array_append(cdf.a, "xyz")).toPandas(), sdf.select(SF.array_append(sdf.a, "xyz")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.array_append(cdf.a, CF.lit("ab"))).toPandas(), sdf.select(SF.array_append(sdf.a, SF.lit("ab"))).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.array_append(cdf.a, cdf.f)).toPandas(), sdf.select(SF.array_append(sdf.a, 

         # test array_prepend
         self.assert_eq(
             cdf.select(CF.array_prepend(cdf.a, "xyz")).toPandas(),
             sdf.select(SF.array_prepend(sdf.a, "xyz")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.array_prepend(cdf.a, CF.lit("ab"))).toPandas(),
             sdf.select(SF.array_prepend(sdf.a, SF.lit("ab"))).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.array_prepend(cdf.a, cdf.f)).toPandas(),
             sdf.select(SF.array_prepend(sdf.a, sdf.f)).toPandas(),
+            check_exact=False,
         )

         # test array_insert
         self.assert_eq(
             cdf.select(CF.array_insert(cdf.a, -5, "ab")).toPandas(),
             sdf.select(SF.array_insert(sdf.a, -5, "ab")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.array_insert(cdf.a, 3, cdf.f)).toPandas(),
             sdf.select(SF.array_insert(sdf.a, 3, sdf.f)).toPandas(),
+            check_exact=False,
         )

         # test array_join
@@ -1088,6 +1108,7 @@ def test_collection_functions(self):
             sdf.select(
                 SF.array_join(sdf.a, ","), SF.array_join("b", ":"), SF.array_join("c", "~")
             ).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(
@@ -1100,20 +1121,24 @@ def test_collection_functions(self):
                 SF.array_join("b", ":", ".null."),
                 SF.array_join("c", "~", "NULL"),
             ).toPandas(),
+            check_exact=False,
         )

         # test array_repeat
         self.assert_eq(
             cdf.select(CF.array_repeat(cdf.f, "d")).toPandas(),
             sdf.select(SF.array_repeat(sdf.f, "d")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.array_repeat("f", cdf.d)).toPandas(),
             sdf.select(SF.array_repeat("f", sdf.d)).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.array_repeat("f", 3)).toPandas(),
             sdf.select(SF.array_repeat("f", 3)).toPandas(),
+            check_exact=False,
         )

         # test arrays_zip
@@ -1174,6 +1199,7 @@ def test_collection_functions(self):
         self.assert_eq(
             cdf.select(CF.slice(cdf.a, 1, 2), CF.slice("c", 2, 3)).toPandas(),
             sdf.select(SF.slice(sdf.a, 1, 2), SF.slice("c", 2, 3)).toPandas(),
+            check_exact=False,
         )

         with self.assertRaises(PySparkTypeError) as pe:
@@ -1198,6 +1224,7 @@ def test_collection_functions(self):
         self.assert_eq(
             cdf.select(CF.sort_array(cdf.a, True), CF.sort_array("c", False)).toPandas(),
             sdf.select(SF.sort_array(sdf.a, True), SF.sort_array("c", False)).toPandas(),
+            check_exact=False,
         )

         # test struct
@@ -1210,18 +1237,22 @@ def test_collection_functions(self):
         self.assert_eq(
             cdf.select(CF.sequence(CF.lit(1), CF.lit(5))).toPandas(),
             sdf.select(SF.sequence(SF.lit(1), SF.lit(5))).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.sequence(CF.lit(1), CF.lit(5), CF.lit(1))).toPandas(),
             sdf.select(SF.sequence(SF.lit(1), SF.lit(5), SF.lit(1))).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.sequence(cdf.d, "e")).toPandas(),
             sdf.select(SF.sequence(sdf.d, "e")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.sequence(cdf.d, "e", CF.lit(1))).toPandas(),
             sdf.select(SF.sequence(sdf.d, "e", SF.lit(1))).toPandas(),
+            check_exact=False,
         )

     def test_map_collection_functions(self):
@@ -1305,44 +1336,53 @@ def test_generator_functions(self):
         self.assert_eq(
             cdf.select(CF.explode(cdf.a), CF.col("b")).toPandas(),
             sdf.select(SF.explode(sdf.a), SF.col("b")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.explode("a"), "b").toPandas(),
             sdf.select(SF.explode("a"), "b").toPandas(),
+            check_exact=False,
         )

         # test explode with maps
         self.assert_eq(
             cdf.select(CF.explode(cdf.d), CF.col("c")).toPandas(),
             sdf.select(SF.explode(sdf.d), SF.col("c")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.explode("d"), "c").toPandas(),
sdf.select(SF.explode("d"), "c").toPandas(), + check_exact=False, ) # test explode_outer with arrays self.assert_eq( cdf.select(CF.explode_outer(cdf.a), CF.col("b")).toPandas(), sdf.select(SF.explode_outer(sdf.a), SF.col("b")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.explode_outer("a"), "b").toPandas(), sdf.select(SF.explode_outer("a"), "b").toPandas(), + check_exact=False, ) # test explode_outer with maps self.assert_eq( cdf.select(CF.explode_outer(cdf.d), CF.col("c")).toPandas(), sdf.select(SF.explode_outer(sdf.d), SF.col("c")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.explode_outer("d"), "c").toPandas(), sdf.select(SF.explode_outer("d"), "c").toPandas(), + check_exact=False, ) # test flatten self.assert_eq( cdf.select(CF.flatten(CF.array("b", cdf.c)), CF.col("b")).toPandas(), sdf.select(SF.flatten(SF.array("b", sdf.c)), SF.col("b")).toPandas(), + check_exact=False, ) # test inline @@ -1353,6 +1393,7 @@ def test_generator_functions(self): sdf.select(SF.expr("ARRAY(STRUCT(e, f), STRUCT(g AS e, f))").alias("X")) .select(SF.inline("X")) .toPandas(), + check_exact=False, ) # test inline_outer @@ -1363,44 +1404,53 @@ def test_generator_functions(self): sdf.select(SF.expr("ARRAY(STRUCT(e, f), STRUCT(g AS e, f))").alias("X")) .select(SF.inline_outer("X")) .toPandas(), + check_exact=False, ) # test posexplode with arrays self.assert_eq( cdf.select(CF.posexplode(cdf.a), CF.col("b")).toPandas(), sdf.select(SF.posexplode(sdf.a), SF.col("b")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.posexplode("a"), "b").toPandas(), sdf.select(SF.posexplode("a"), "b").toPandas(), + check_exact=False, ) # test posexplode with maps self.assert_eq( cdf.select(CF.posexplode(cdf.d), CF.col("c")).toPandas(), sdf.select(SF.posexplode(sdf.d), SF.col("c")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.posexplode("d"), "c").toPandas(), sdf.select(SF.posexplode("d"), "c").toPandas(), + check_exact=False, ) # test posexplode_outer with arrays self.assert_eq( cdf.select(CF.posexplode_outer(cdf.a), CF.col("b")).toPandas(), sdf.select(SF.posexplode_outer(sdf.a), SF.col("b")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.posexplode_outer("a"), "b").toPandas(), sdf.select(SF.posexplode_outer("a"), "b").toPandas(), + check_exact=False, ) # test posexplode_outer with maps self.assert_eq( cdf.select(CF.posexplode_outer(cdf.d), CF.col("c")).toPandas(), sdf.select(SF.posexplode_outer(sdf.d), SF.col("c")).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.posexplode_outer("d"), "c").toPandas(), sdf.select(SF.posexplode_outer("d"), "c").toPandas(), + check_exact=False, ) def test_lambda_functions(self): @@ -1465,6 +1515,7 @@ def test_lambda_functions(self): self.assert_eq( cdf.select(CF.array_sort(cdf.b, lambda x, y: CF.abs(x) - CF.abs(y))).toPandas(), sdf.select(SF.array_sort(sdf.b, lambda x, y: SF.abs(x) - SF.abs(y))).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select( @@ -1483,26 +1534,31 @@ def test_lambda_functions(self): ), ) ).toPandas(), + check_exact=False, ) # test filter self.assert_eq( cdf.select(CF.filter(cdf.b, lambda x: x < 0)).toPandas(), sdf.select(SF.filter(sdf.b, lambda x: x < 0)).toPandas(), + check_exact=False, ) self.assert_eq( cdf.select(CF.filter("a", lambda x: ~CF.isnull(x))).toPandas(), sdf.select(SF.filter("a", lambda x: ~SF.isnull(x))).toPandas(), + check_exact=False, ) # test forall self.assert_eq( cdf.select(CF.filter(cdf.b, lambda x: x != 0)).toPandas(), 
             sdf.select(SF.filter(sdf.b, lambda x: x != 0)).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.filter("a", lambda x: ~CF.isnull(x))).toPandas(),
             sdf.select(SF.filter("a", lambda x: ~SF.isnull(x))).toPandas(),
+            check_exact=False,
         )

         # test transform
@@ -1510,30 +1566,36 @@ def test_lambda_functions(self):
         self.assert_eq(
             cdf.select(CF.transform(cdf.b, lambda x: x + 1)).toPandas(),
             sdf.select(SF.transform(sdf.b, lambda x: x + 1)).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.transform("b", lambda x: x + 1)).toPandas(),
             sdf.select(SF.transform("b", lambda x: x + 1)).toPandas(),
+            check_exact=False,
         )

         # transform with index
         self.assert_eq(
             cdf.select(CF.transform(cdf.b, lambda x, i: x + 1 - i)).toPandas(),
             sdf.select(SF.transform(sdf.b, lambda x, i: x + 1 - i)).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.transform("b", lambda x, i: x + 1 - i)).toPandas(),
             sdf.select(SF.transform("b", lambda x, i: x + 1 - i)).toPandas(),
+            check_exact=False,
         )

         # test zip_with
         self.assert_eq(
             cdf.select(CF.zip_with(cdf.b, "c", lambda v1, v2: v1 - CF.abs(v2))).toPandas(),
             sdf.select(SF.zip_with(sdf.b, "c", lambda v1, v2: v1 - SF.abs(v2))).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.zip_with("b", cdf.c, lambda v1, v2: v1 - CF.abs(v2))).toPandas(),
             sdf.select(SF.zip_with("b", sdf.c, lambda v1, v2: v1 - SF.abs(v2))).toPandas(),
+            check_exact=False,
         )

         # test map_filter
@@ -2065,6 +2127,7 @@ def test_string_functions_multi_args(self):
         self.assert_eq(
             cdf.select(CF.split(cdf.b, "[bd]")).toPandas(),
             sdf.select(SF.split(sdf.b, "[bd]")).toPandas(),
+            check_exact=False,
         )
         self.assert_eq(
             cdf.select(CF.regexp_extract(cdf.b, "(a+)(b)?(c)", 1)).toPandas(),