From 1bb463475928ae34fd4f361383dbdb46a6fb987e Mon Sep 17 00:00:00 2001 From: jacgoldsm Date: Sat, 7 Oct 2023 13:07:36 -0400 Subject: [PATCH] black --- osos/OsosSession.py | 50 +- osos/_implementations.py | 421 ++++------- osos/column.py | 38 +- osos/dataframe.py | 41 +- osos/exceptions.py | 4 +- osos/expr.py | 30 +- osos/functions.py | 1364 ++++++++------------------------- osos/indexer.py | 2 +- osos/types.py | 8 +- osos/utils.py | 34 +- osos/window.py | 4 +- setup.py | 5 +- tests/auto_t_est.py | 1553 ++++++++++++++++++++++---------------- tests/scrape.py | 55 +- tests/test_basic.py | 7 +- 15 files changed, 1502 insertions(+), 2114 deletions(-) diff --git a/osos/OsosSession.py b/osos/OsosSession.py index 436349e..ce0ca51 100644 --- a/osos/OsosSession.py +++ b/osos/OsosSession.py @@ -17,22 +17,27 @@ def createDataFrame(data, schema=None) -> DataFrame: schema has to be a list or a simple string""" ) schema = _parse_schema(schema) - if isinstance(data, (dict,tuple)): + if isinstance(data, (dict, tuple)): data = [data] - + if isinstance(data, pd.DataFrame): return DataFrame(data) if isinstance(data[0], dict): - warnings.warn("This is the Spark way of defining a DataFrame, not the Pandas way. Each " - "dict represents a row, not the whole data") + warnings.warn( + "This is the Spark way of defining a DataFrame, not the Pandas way. Each " + "dict represents a row, not the whole data" + ) if isinstance(data, list): # either a list of tuples or a list of dicts. # either way, handled by pd.DataFrame.from_records - return DataFrame(pd.DataFrame.from_records(data, columns = schema)) + return DataFrame(pd.DataFrame.from_records(data, columns=schema)) else: - raise AnalysisException("Data must be a dict, tuple, pandas DataFrame, list[dict], or list[tuple]") + raise AnalysisException( + "Data must be a dict, tuple, pandas DataFrame, list[dict], or list[tuple]" + ) + def _parse_schema(schema): if isinstance(schema, str): @@ -43,21 +48,27 @@ def _parse_schema(schema): schema = "".join(schema.split()).split(",") cols = [] for elem in schema: - cols.append(re.search('(.*):', elem).group(1)) + cols.append(re.search("(.*):", elem).group(1)) return cols - elif isinstance(schema,list): + elif isinstance(schema, list): return schema else: raise TypeError("schema must be str or list") -def range(start: int, end: Optional[int] = None, step: int = 1, numSlices: Optional[int] = None): +def range( + start: int, + end: Optional[int] = None, + step: int = 1, + numSlices: Optional[int] = None, +): import numpy as np + if end is None: end = start start = 0 - return DataFrame(pd.DataFrame({"id":np.arange(start,end,step)})) + return DataFrame(pd.DataFrame({"id": np.arange(start, end, step)})) class read: @@ -67,12 +78,13 @@ def csv(path, *args, **kwargs): def text(path, *args, **kwargs): return pd.read_csv(path, *args, **kwargs) + def _test(): - pd_data = pd.DataFrame({"a":[1,2,3], "b":[4,5,6]}) - tuple_data = (1,2) - dict_data = {"a":1, "b":2} - list_tuple_data = [(1,2), (3,4)] - list_dict_data = [{"a":1, "b":2}, {"a":3, "b":4}] + pd_data = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + tuple_data = (1, 2) + dict_data = {"a": 1, "b": 2} + list_tuple_data = [(1, 2), (3, 4)] + list_dict_data = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] str_schema_one = "ai bi" str_schema_two = "ai: int, bi:int" list_schema = ["ai", "bi"] @@ -85,11 +97,11 @@ def _test(): print() print(createDataFrame(pd_data)) print() - print(createDataFrame(pd_data,str_schema_one)) + print(createDataFrame(pd_data, str_schema_one)) print() print(createDataFrame(tuple_data)) print() - print(createDataFrame(tuple_data,str_schema_two)) + print(createDataFrame(tuple_data, str_schema_two)) print() print(createDataFrame(list_tuple_data, str_schema_one)) print() @@ -98,7 +110,5 @@ def _test(): print(createDataFrame(dict_data)) -if __name__ == '__main__': +if __name__ == "__main__": _test() - - diff --git a/osos/_implementations.py b/osos/_implementations.py index 69921ac..a67ccf5 100644 --- a/osos/_implementations.py +++ b/osos/_implementations.py @@ -4,7 +4,7 @@ import numpy as np from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.window.rolling import Rolling as RollingWindow -from typing import Union, Callable, Iterable, Any, overload, Optional,Dict +from typing import Union, Callable, Iterable, Any, overload, Optional, Dict from warnings import warn from .window import EmptyWindow, ConcreteWindowSpec @@ -127,7 +127,9 @@ def mode_func(series: pd.Series, *args, **kwargs): return pd.Series(series.mode(*args, **kwargs)) roll = _get_rolling_window(series, *args, **kwargs) try: - return roll.apply(pd.Series.mode).reset_index()[series.name].astype(series.dtype) + return ( + roll.apply(pd.Series.mode).reset_index()[series.name].astype(series.dtype) + ) except pd.errors.IntCastingNaNError: return roll.apply(pd.Series.mode).reset_index()[series.name] @@ -185,9 +187,11 @@ def count_func(series: pd.Series, *args, **kwargs): def lower_func(series: pd.Series, *args, **kwargs): return series.str.lower() + def upper_func(series: pd.Series, *args, **kwargs): return series.str.upper() + def avg_func(series: pd.Series, *args, **kwargs): if isinstance(kwargs["_over"], EmptyWindow): kwargs.pop("_over") @@ -212,7 +216,7 @@ def sum_distinct_func(series: pd.Series, *args, **kwargs): kwargs.pop("_over") return pd.Series(series.unique().sum(*args, **kwargs)) else: - raise Exception("`sum_distinct` isn't available as a Window function") + raise Exception("`sum_distinct` isn't available as a Window function") def product_func(series: pd.Series, *args, **kwargs): @@ -230,7 +234,6 @@ def acos_func(series: pd.Series, *args, **kwargs): return pd.Series(np.arccos(series)) - def acosh_func(series: pd.Series, *args, **kwargs): return pd.Series(np.arccosh(series)) @@ -279,12 +282,10 @@ def exp_func(series: pd.Series, *args, **kwargs): return pd.Series(np.exp(series)) - def expm1_func(series: pd.Series, *args, **kwargs): return pd.Series(np.expm1(series)) - def floor_func(series: pd.Series, *args, **kwargs): return pd.Series(np.floor(series)) @@ -309,7 +310,6 @@ def signum_func(series: pd.Series, *args, **kwargs): return pd.Series(np.sign(series)) - def sin_func(series: pd.Series, *args, **kwargs): return pd.Series(np.sin(series)) @@ -391,7 +391,11 @@ def kurtosis_func(series: pd.Series, *args, **kwargs): return pd.Series(series.kurtosis(*args, **kwargs)) roll = _get_rolling_window(series, *args, **kwargs) try: - return roll.apply(pd.Series.kurtosis).reset_index()[series.name].astype(series.dtype) + return ( + roll.apply(pd.Series.kurtosis) + .reset_index()[series.name] + .astype(series.dtype) + ) except pd.errors.IntCastingNaNError: return roll.apply(pd.Series.kurtosis).reset_index()[series.name] @@ -405,7 +409,7 @@ def hypot_func(series: pd.Series, *args, **kwargs): def pow_func(col1: pd.Series, col2: pd.Series, *args, **kwargs): - return col1 ** col2 + return col1**col2 def pmod_func(col1: pd.Series, col2: pd.Series, *args, **kwargs): @@ -448,14 +452,15 @@ def coalesce_func(*cols): raise AnalysisException("Need at least 2 cols to coalesce.") out = cols[0] for i in range(len(cols) - 1): - out = np.where(~pd.isnull(out), out, cols[i+1]) + out = np.where(~pd.isnull(out), out, cols[i + 1]) return out -def lag_func(series: pd.Series, *args, **kwargs): + +def lag_func(series: pd.Series, *args, **kwargs): if isinstance(kwargs["_over"], EmptyWindow): raise AnalysisException("`lag_func` cannot be an Aggregate function") - roll = _get_rolling_window(series, _convert_to_rolling = False, *args, **kwargs) + roll = _get_rolling_window(series, _convert_to_rolling=False, *args, **kwargs) if len(args) == 0: offset, default = 1, None elif len(args) == 1: @@ -463,109 +468,113 @@ def lag_func(series: pd.Series, *args, **kwargs): elif len(args) == 2: offset, default = args[0], args[1] try: - return roll.shift(periods = offset, fill_value = default).sort_index().reset_index()[series.name].astype(series.dtype) + return ( + roll.shift(periods=offset, fill_value=default) + .sort_index() + .reset_index()[series.name] + .astype(series.dtype) + ) except pd.errors.IntCastingNaNError: - return roll.shift(periods = offset, fill_value = default).sort_index().reset_index()[series.name] + return ( + roll.shift(periods=offset, fill_value=default) + .sort_index() + .reset_index()[series.name] + ) def lead_func(series: pd.Series, *args, **kwargs): if isinstance(kwargs["_over"], EmptyWindow): raise AnalysisException("`lead_func` cannot be an Aggregate function") - roll = _get_rolling_window(series, _convert_to_rolling = False, *args, **kwargs) + roll = _get_rolling_window(series, _convert_to_rolling=False, *args, **kwargs) if len(args) == 0: offset, default = -1, None elif len(args) == 1: - offset, default = -1*args[0], None + offset, default = -1 * args[0], None elif len(args) == 2: - offset, default = -1*args[0], args[1] + offset, default = -1 * args[0], args[1] try: - return roll.shift(periods = offset, fill_value = default).sort_index().reset_index()[series.name].astype(series.dtype) + return ( + roll.shift(periods=offset, fill_value=default) + .sort_index() + .reset_index()[series.name] + .astype(series.dtype) + ) except pd.errors.IntCastingNaNError: - return roll.shift(periods = offset, fill_value = default).sort_index().reset_index()[series.name] + return ( + roll.shift(periods=offset, fill_value=default) + .sort_index() + .reset_index()[series.name] + ) def corr_func(col1: pd.Series, col2: pd.Series): - - raise NotImplementedError + raise NotImplementedError def covar_pop_func(col1: pd.Series, col2: pd.Series): - - raise NotImplementedError + raise NotImplementedError def covar_samp_func(col1: pd.Series, col2: pd.Series): - - raise NotImplementedError + raise NotImplementedError def countDistinct_func(col: pd.Series, *cols: pd.Series): - - return count_distinct_func(col, *cols) + return count_distinct_func(col, *cols) def count_distinct_func(col: pd.Series, *cols: pd.Series): - - raise NotImplemented("") + raise NotImplemented("") def first_func(col: pd.Series, ignorenulls: bool = False): - - raise NotImplementedError + raise NotImplementedError def grouping_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def grouping_id(*cols: pd.Series): - - raise NotImplementedError + raise NotImplementedError def input_file_name(): - - raise NotImplementedError + raise NotImplementedError def isnan_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def isnull_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def last_func(col: pd.Series, ignorenulls: bool = False): - - raise NotImplementedError + raise NotImplementedError def monotonically_increasing_id(): - - raise NotImplementedError + raise NotImplementedError def nanvl_func(col1: pd.Series, col2: pd.Series): - - raise NotImplementedError + raise NotImplementedError def percentile_approx_func( @@ -573,102 +582,87 @@ def percentile_approx_func( percentage: pd.Series | float | list[float] | tuple[float], accuracy: Union[pd.Series, float] = 10000, ): - if isinstance(percentage, pd.Series): - + percentage = pd.Series(percentage) else: - + percentage = pd.Series(percentage) accuracy = ( - pd.Series(accuracy) - if isinstance(accuracy, pd.Series) - else pd.Series(accuracy) + pd.Series(accuracy) if isinstance(accuracy, pd.Series) else pd.Series(accuracy) ) raise NotImplementedError - def rand_func(seed: Optional[int] = None): - + if seed is not None: raise NotImplementedError else: raise NotImplementedError - def randn_func(seed: Optional[int] = None): - + if seed is not None: raise NotImplementedError else: raise NotImplementedError - def round_func(col: pd.Series, scale: int = 0): - - raise NotImplementedError + raise NotImplementedError def bround_func(col: pd.Series, scale: int = 0): - - raise NotImplementedError + raise NotImplementedError def shiftLeft_func(col: pd.Series, numBits: int): - + warn("Deprecated in 3.2, use shiftleft instead.", FutureWarning) return shiftleft_func(col, numBits) - def shiftleft_func(col: pd.Series, numBits: int): - - raise NotImplementedError + raise NotImplementedError def shiftRight_func(col: pd.Series, numBits: int): - + warn("Deprecated in 3.2, use shiftright instead.", FutureWarning) return shiftright_func(col, numBits) - def shiftright_func(col: pd.Series, numBits: int): - - raise NotImplementedError + raise NotImplementedError def shiftRightUnsigned_func(col: pd.Series, numBits: int): - + warn("Deprecated in 3.2, use shiftrightunsigned instead.", FutureWarning) return shiftrightunsigned_func(col, numBits) - def shiftrightunsigned_func(col: pd.Series, numBits: int): - - raise NotImplementedError + raise NotImplementedError def spark_partition_id(): - - raise NotImplementedError + raise NotImplementedError def expr(str: str): - + raise NotImplementedError @@ -678,28 +672,24 @@ def struct_func(*cols: pd.Series): @overload -def struct_func( - __cols: Union[list[pd.Series], tuple[pd.Series, ...]] -): +def struct_func(__cols: Union[list[pd.Series], tuple[pd.Series, ...]]): ... - def struct_func( *cols: Union[ pd.Series, list[pd.Series] | tuple[pd.Series, ...], ] ): - + if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] + cols = cols[0] raise NotImplementedError - def greatest(*cols: pd.Series): - + if len(cols) < 2: raise OsosValueError( error_class="WRONG_NUM_pd.SeriesS", @@ -708,9 +698,8 @@ def greatest(*cols: pd.Series): raise NotImplementedError - def least_func(*cols: pd.Series): - + if len(cols) < 2: raise OsosValueError( error_class="WRONG_NUM_pd.SeriesS", @@ -719,10 +708,8 @@ def least_func(*cols: pd.Series): raise NotImplementedError - def when_func(condition: pd.Series, value: Any): - - + if not isinstance(condition, pd.Series): raise OsosTypeError( error_class="NOT_pd.Series", @@ -736,7 +723,7 @@ def when_func(condition: pd.Series, value: Any): raise NotImplementedError -@overload +@overload def log_func(arg1: pd.Series): ... @@ -746,164 +733,129 @@ def log_func(arg1: float, arg2: pd.Series): ... +def log_func(arg1: pd.Series | float, arg2: Optional[pd.Series] = None): -def log_func( - arg1: pd.Series | float, arg2: Optional[pd.Series] = None -): - if arg2 is None: raise NotImplementedError else: raise NotImplementedError - def log2_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def conv_func(col: pd.Series, fromBase: int, toBase: int): - - raise NotImplementedError + raise NotImplementedError def factorial_func(col: pd.Series): - + raise NotImplementedError +def nth_value_func(col: pd.Series, offset: int, ignoreNulls: Optional[bool] = False): -def nth_value_func( - col: pd.Series, offset: int, ignoreNulls: Optional[bool] = False -): - raise NotImplementedError - def ntile_func(n: int): - + raise NotImplementedError def date_format(date: pd.Series, format: str): - - raise NotImplementedError + raise NotImplementedError def year_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def quarter_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def month_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def dayofweek_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def dayofmonth_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def dayofyear_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def hour_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def minute_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def second_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def weekofyear_func(col: pd.Series): - + raise NotImplementedError +def make_date_func(year: pd.Series, month: pd.Series, day: pd.Series): -def make_date_func( - year: pd.Series, month: pd.Series, day: pd.Series -): - raise NotImplementedError - def date_add_func(start: pd.Series, days: Union[pd.Series, int]): - + days = pd.Series(days) if isinstance(days, int) else days raise NotImplementedError - def date_sub_func(start: pd.Series, days: Union[pd.Series, int]): - + days = pd.Series(days) if isinstance(days, int) else days raise NotImplementedError - def datediff_func(end: pd.Series, start: pd.Series): - + raise NotImplementedError +def add_months_func(start: pd.Series, months: Union[pd.Series, int]): -def add_months_func( - start: pd.Series, months: Union[pd.Series, int] -): - months = pd.Series(months) if isinstance(months, int) else months raise NotImplementedError +def months_between_func(date1: pd.Series, date2: pd.Series, roundOff: bool = True): -def months_between_func( - date1: pd.Series, date2: pd.Series, roundOff: bool = True -): - - raise NotImplemented( - "months_between", pd.Series(date1), pd.Series(date2), roundOff - ) - + raise NotImplemented("months_between", pd.Series(date1), pd.Series(date2), roundOff) def to_date_func(col: pd.Series, format: Optional[str] = None): - + if format is None: raise NotImplementedError else: @@ -920,44 +872,36 @@ def to_timestamp_func(col: pd.Series, format: str): ... - def to_timestamp_func(col: pd.Series, format: Optional[str] = None): - + if format is None: raise NotImplementedError else: raise NotImplementedError - def trunc_func(date: pd.Series, format: str): - - raise NotImplementedError + raise NotImplementedError def date_trunc_func(format: str, timestamp: pd.Series): - - raise NotImplementedError + raise NotImplementedError def next_day_func(date: pd.Series, dayOfWeek: str): - - raise NotImplementedError + raise NotImplementedError def last_day_func(date: pd.Series): - + raise NotImplementedError +def from_unixtime_func(timestamp: pd.Series, format: str = "yyyy-MM-dd HH:mm:ss"): -def from_unixtime_func( - timestamp: pd.Series, format: str = "yyyy-MM-dd HH:mm:ss" -): - raise NotImplementedError @@ -971,40 +915,34 @@ def unix_timestamp_func(): ... - def unix_timestamp_func( timestamp: Optional[pd.Series] = None, format: str = "yyyy-MM-dd HH:mm:ss" ): - + if timestamp is None: raise NotImplementedError raise NotImplementedError - def from_utc_timestamp_func(timestamp: pd.Series, tz: pd.Series): - + if isinstance(tz, pd.Series): tz = pd.Series(tz) raise NotImplementedError - def to_utc_timestamp_func(timestamp: pd.Series, tz: pd.Series): - + if isinstance(tz, pd.Series): tz = pd.Series(tz) raise NotImplementedError - def timestamp_seconds_func(col: pd.Series): - raise NotImplementedError - def window_func( series: pd.Series, windowDuration: str, @@ -1014,55 +952,44 @@ def window_func( raise NotImplementedError - def window_time_func( window: pd.Series, ): raise NotImplementedError - - def crc32_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def md5_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def sha1_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def sha2_func(col: pd.Series, numBits: int): - - raise NotImplementedError + raise NotImplementedError def hash_func(*cols: pd.Series): - - raise NotImplementedError + raise NotImplementedError def xxhash64_func(*cols: pd.Series): - + raise NotImplementedError +def assert_true_func(col: pd.Series, errMsg: Optional[Union[pd.Series, str]] = None): -def assert_true_func( - col: pd.Series, errMsg: Optional[Union[pd.Series, str]] = None -): - if errMsg is None: raise NotImplementedError if not isinstance(errMsg, (str, pd.Series)): @@ -1074,17 +1001,12 @@ def assert_true_func( }, ) - errMsg = ( - "foo" - if isinstance(errMsg, str) - else pd.Series(errMsg) - ) + errMsg = "foo" if isinstance(errMsg, str) else pd.Series(errMsg) raise NotImplementedError - def raise_error_func(errMsg: Union[pd.Series, str]): - + if not isinstance(errMsg, (str, pd.Series)): raise OsosTypeError( error_class="NOT_pd.Series_OR_STR", @@ -1098,77 +1020,64 @@ def raise_error_func(errMsg: Union[pd.Series, str]): raise NotImplementedError - def ascii_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def base64_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def unbase64_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def ltrim_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def rtrim_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def trim_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def concat_ws(sep: str, *cols: pd.Series): - - raise NotImplementedError + raise NotImplementedError def decode_func(col: pd.Series, charset: str): - - raise NotImplementedError + raise NotImplementedError def encode_func(col: pd.Series, charset: str): - - raise NotImplementedError + raise NotImplementedError def format_number_func(col: pd.Series, d: int): - - raise NotImplementedError + raise NotImplementedError def format_string_func(format: str, *cols: pd.Series): - - raise NotImplementedError + raise NotImplementedError def instr_func(str: pd.Series, substr: str): - - raise NotImplementedError + raise NotImplementedError def overlay_func( @@ -1177,7 +1086,7 @@ def overlay_func( pos: Union[pd.Series, int], len: Union[pd.Series, int] = -1, ): - + if not isinstance(pos, (int, str, pd.Series)): raise OsosTypeError( error_class="NOT_pd.Series_OR_INT_OR_STR", @@ -1195,13 +1104,12 @@ def overlay_func( raise NotImplementedError - def sentences_func( string: pd.Series, language: Optional[pd.Series] = None, country: Optional[pd.Series] = None, ): - + if language is None: language = pd.Series("") if country is None: @@ -1210,59 +1118,49 @@ def sentences_func( raise NotImplementedError - def substring_func(str: pd.Series, pos: int, len: int): - - raise NotImplementedError + raise NotImplementedError def substring_index_func(str: pd.Series, delim: str, count: int): - - raise NotImplementedError + raise NotImplementedError def levenshtein_Func(left: pd.Series, right: pd.Series): - - raise NotImplementedError + raise NotImplementedError def locate_func(substr: str, str: pd.Series, pos: int = 1): - - raise NotImplementedError + raise NotImplementedError def lpad_func(col: pd.Series, len: int, pad: str): - - raise NotImplementedError + raise NotImplementedError def rpad_func(col: pd.Series, len: int, pad: str): - - raise NotImplementedError + raise NotImplementedError def repeat_func(col: pd.Series, n: int): - - raise NotImplementedError + raise NotImplementedError def split_func(str: pd.Series, pattern: str, limit: int = -1): - - raise NotImplementedError + raise NotImplementedError def regexp_extract_func(str: pd.Series, pattern: str, idx: int): - - raise NotImplementedError + raise NotImplementedError def regexp_replace_func( @@ -1270,7 +1168,7 @@ def regexp_replace_func( pattern: Union[str, pd.Series], replacement: Union[str, pd.Series], ): - + if isinstance(pattern, str): pattern_col = pd.Series(pattern) else: @@ -1282,55 +1180,46 @@ def regexp_replace_func( raise NotImplementedError - def initcap_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def soundex_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def bin_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def hex_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def unhex_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def length_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def octet_length_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def bit_length_func(col: pd.Series): - - raise NotImplementedError + raise NotImplementedError def translate_func(srcCol: pd.Series, matching: str, replace: str): - - raise NotImplementedError \ No newline at end of file + + raise NotImplementedError diff --git a/osos/column.py b/osos/column.py index b48daa0..545c1d7 100644 --- a/osos/column.py +++ b/osos/column.py @@ -1,7 +1,7 @@ from __future__ import annotations from numbers import Number -from typing import Union, Iterable, TYPE_CHECKING,List +from typing import Union, Iterable, TYPE_CHECKING, List import operator from copy import deepcopy import numpy as np @@ -14,9 +14,11 @@ NumOrCol = Union[Number, "AbstractColOrLit"] + def rename_series(series, newname: str, _over=None) -> pd.Series: return series.rename(newname) + class Node: def __init__(self, name, args): self._name = name @@ -95,6 +97,7 @@ def __ne__(self, other): def alias(self, newname): return Func(rename_series, self, NameString(newname, ())) + class When(Node): """ The `_args` in a `When` Node will have one of the following two structures: @@ -103,14 +106,18 @@ class When(Node): In either case, even numbered elements are conditions, and odd-numbered elements are values. - In the resolved form, `args` is a list of columns, where even-numbered elements are + In the resolved form, `args` is a list of columns, where even-numbered elements are boolean conditions, and odd-numbered elements are values. """ + def __init__(self, condition, value): self._name = self._when_func - if not isinstance(value,Node): + if not isinstance(value, Node): value = AbstractLit(value) - self._args = [condition,value,] + self._args = [ + condition, + value, + ] @staticmethod def _when_func(*args: pd.Series, **kwargs): @@ -120,8 +127,8 @@ def _when_func(*args: pd.Series, **kwargs): else: null_type = None col = np.full(len(args[0].index), null_type) - conditions = [args[i] for i in range(len(args)) if i % 2 == 0] # even numbers - values = [args[i] for i in range(len(args)) if i % 2 == 1] # odd numbers + conditions = [args[i] for i in range(len(args)) if i % 2 == 0] # even numbers + values = [args[i] for i in range(len(args)) if i % 2 == 1] # odd numbers # `i` will loop over all the conditions in reverse order, # so starting with `True` if `otherwise` exists @@ -130,21 +137,24 @@ def _when_func(*args: pd.Series, **kwargs): # make some effort to cast back to int if possible # i.e. if all the replacement values were ints and there are no missings - if all(np.issubdtype(val,np.integer) for val in values) and not np.isnan(col).any(): + if ( + all(np.issubdtype(val, np.integer) for val in values) + and not np.isnan(col).any() + ): col = col.astype(int) - + return pd.Series(col) - def when(self, condition,value): - if not isinstance(value,Node): + def when(self, condition, value): + if not isinstance(value, Node): value = AbstractLit(value) - self._args += [condition,value] + self._args += [condition, value] return self - def otherwise(self,value): - if not isinstance(value,Node): + def otherwise(self, value): + if not isinstance(value, Node): value = AbstractLit(value) - self._args += [SimpleContainer(True, []),value] + self._args += [SimpleContainer(True, []), value] return self diff --git a/osos/dataframe.py b/osos/dataframe.py index 93fbc53..a5ecaf9 100644 --- a/osos/dataframe.py +++ b/osos/dataframe.py @@ -13,7 +13,7 @@ from copy import deepcopy, copy import numpy as np -from typing import Iterable, Union, Optional,cast +from typing import Iterable, Union, Optional, cast import pandas as pd @@ -23,7 +23,8 @@ from .window import EmptyWindow from ._forwardrefs import forward_dict -NodeOrStr = Union[Node,str] +NodeOrStr = Union[Node, str] + class DataFrame: @staticmethod @@ -41,14 +42,13 @@ def __getitem__(self, item): elif isinstance(item, (list, tuple)): return self.select(*item) elif isinstance(item, int): - return AbstractCol(self._data.iloc[:,item].name) + return AbstractCol(self._data.iloc[:, item].name) def __getattr__(self, attr): if attr not in self._data.columns: raise Exception("Attribute not found") return AbstractCol(attr) - @property def true_index(self): return np.arange(len(self._data.index)) @@ -95,12 +95,14 @@ def _eval_recursive(self, expr: Union[Node, ForwardRef]) -> pd.Series: if isinstance(node, Func): res = node._name( *list(self._eval_recursive(n) for n in node._args), - _over=self._eval_recursive(node._over) + _over=self._eval_recursive(node._over), ) else: - res: pd.Series = node._name(*list(self._eval_recursive(n) for n in node._args)) + res: pd.Series = node._name( + *list(self._eval_recursive(n) for n in node._args) + ) - return res # type: ignore + return res # type: ignore def withColumn(self, name: str, expr: Node) -> "DataFrame": @@ -134,26 +136,33 @@ def filter(self, *exprs: NodeOrStr) -> "DataFrame": boolean_mask.dtype == np.bool8 ), "`filter` expressions must return boolean results" newdf = DataFrame(newdf._data.loc[boolean_mask]) - newdf._data.index = np.arange(len(newdf._data.index)) #type: ignore + newdf._data.index = np.arange(len(newdf._data.index)) # type: ignore return newdf - - def show(self, n: int = 0, vertical: bool = False, truncate: Union[bool, int] = False): - if n == 0: + def show( + self, n: int = 0, vertical: bool = False, truncate: Union[bool, int] = False + ): + if n == 0: n = len(self._data.index) if isinstance(truncate, int) and truncate > 1: l = truncate else: l = 20 nrowsdf = self._data.iloc[0:n] - columns = nrowsdf.columns + columns = nrowsdf.columns separator = "+-" + "-+-".join(["-" * len(col) for col in columns]) + "-+" result = separator + "\n" result += "| " + " | ".join([f"{col:>{len(col)}}" for col in columns]) + " |\n" result += separator + "\n" for _, row in nrowsdf.iterrows(): - row_str = "| " + " | ".join([f"{str(val)[0:l]:>{len(col)}}" for val, col in zip(row, columns)]) + " |\n" + row_str = ( + "| " + + " | ".join( + [f"{str(val)[0:l]:>{len(col)}}" for val, col in zip(row, columns)] + ) + + " |\n" + ) result += row_str result += separator print(result) @@ -190,7 +199,7 @@ def agg(self, *exprs: NodeOrStr) -> "DataFrame": out.append(ser) newdf = pd.concat(out, axis=1) - newdf.index = np.arange(len(newdf.index)) # type: ignore + newdf.index = np.arange(len(newdf.index)) # type: ignore if isinstance(self, GroupedData): newdf = pd.concat([self._uniques, newdf], axis=1) @@ -280,7 +289,7 @@ def join(self, other: "DataFrame", by: Union[str, list], how: str): raise Exception("Unknown join type") return DataFrame(out) - def _resolve_leaf(self, node: NodeOrStr) -> Union[pd.Series,Node]: + def _resolve_leaf(self, node: NodeOrStr) -> Union[pd.Series, Node]: if isinstance(node, AbstractCol): return self._data[node._name] elif isinstance(node, AbstractLit): @@ -310,7 +319,7 @@ def __init__(self, data=None, cols=None): .drop("__index__", axis=1) .reset_index() ) - uniques.index = np.arange(len(uniques.index)) #type: ignore + uniques.index = np.arange(len(uniques.index)) # type: ignore self._uniques = uniques @property diff --git a/osos/exceptions.py b/osos/exceptions.py index e631fd1..e7e6c67 100644 --- a/osos/exceptions.py +++ b/osos/exceptions.py @@ -1,5 +1,6 @@ from __future__ import annotations + class AnalysisException(Exception): pass @@ -13,6 +14,7 @@ class OsosTypeError(Exception): def __init__(self, error_class=None, message_parameters=None): pass + class OsosNotImplementedError: def __init__(self, error_class=None, message_parameters=None): - pass \ No newline at end of file + pass diff --git a/osos/expr.py b/osos/expr.py index b50915e..a3b8b00 100644 --- a/osos/expr.py +++ b/osos/expr.py @@ -1,9 +1,8 @@ from sqlglot import parse_one, Expression -from sqlglot.expressions import Identifier,Literal,Column,Add,Sum,Select +from sqlglot.expressions import Identifier, Literal, Column, Add, Sum, Select from collections.abc import Iterable -from osos.column import Node,AbstractCol,AbstractLit,Func,BinaryOp,UnaryOp - +from osos.column import Node, AbstractCol, AbstractLit, Func, BinaryOp, UnaryOp def print_tree(tree: Expression): @@ -12,30 +11,33 @@ def print_tree(tree: Expression): print(node.args) print("--" * node.depth, node) + def make_tree(tree: Expression) -> Node: for elem in tree.walk(): node = elem[0] - if isinstance(node, (Identifier,Column)): + if isinstance(node, (Identifier, Column)): out = AbstractCol(node.name) elif isinstance(node, Literal): out = AbstractLit(node.name) def main(): - stmts = ["SELECT SUM(x+3)", "SELECT CASE WHEN SUM(avg(x)+2) > 3 THEN 1 ELSE 0 END",] - #stmts = ["select fo,bar from baz"] + stmts = [ + "SELECT SUM(x+3)", + "SELECT CASE WHEN SUM(avg(x)+2) > 3 THEN 1 ELSE 0 END", + ] + # stmts = ["select fo,bar from baz"] for stmt in stmts[:1]: - s=parse_one(stmt) + s = parse_one(stmt) print_tree(s) - #print(select_stmt.parse_string(stmt)) - #print(repr(parse_one(stmt))) + # print(select_stmt.parse_string(stmt)) + # print(repr(parse_one(stmt))) - #print(len(parse_one(stmt))) + # print(len(parse_one(stmt))) ... - #print(select_stmt.parse_string(stmt)[1:][0][0]) - #print(stmt, "\n\t", print_tree(walk_tree(select_stmt.parse_string(stmt)))) + # print(select_stmt.parse_string(stmt)[1:][0][0]) + # print(stmt, "\n\t", print_tree(walk_tree(select_stmt.parse_string(stmt)))) -if __name__ == '__main__': +if __name__ == "__main__": main() - diff --git a/osos/functions.py b/osos/functions.py index 08135eb..86b304c 100644 --- a/osos/functions.py +++ b/osos/functions.py @@ -19,7 +19,7 @@ from .dataframe import DataFrame -from typing import Any, Optional, Union, List, Tuple, overload, Callable,Tuple +from typing import Any, Optional, Union, List, Tuple, overload, Callable, Tuple from warnings import warn # all the public names in _implementations end in "_func" @@ -45,7 +45,7 @@ def lit(col: Any) -> Func: the value to make it as a PySpark literal. If a AbstractCol is passed, it returns the AbstractCol as is. - + Since 3.4.0, it supports the list type. Returns @@ -147,16 +147,10 @@ def asc(col: "AbstractColOrName") -> Func: return asc_func(col) - def desc(col: "AbstractColOrName") -> Func: """ Returns a sort expression based on the descending order of the given AbstractCol name. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -182,21 +176,14 @@ def desc(col: "AbstractColOrName") -> Func: | 0| +---+ """ - if isinstance(col, str): - col = AbstractCol(col) + col = process_one_col(col) return desc_func(col) - def sqrt(col: "AbstractColOrName") -> Func: """ Computes the square root of the specified float value. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -217,21 +204,14 @@ def sqrt(col: "AbstractColOrName") -> Func: | 2.0| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) + col = process_one_col(col) return Func(sqrt_func, col) - def abs(col: "AbstractColOrName") -> Func: """ Computes the absolute value. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -252,21 +232,14 @@ def abs(col: "AbstractColOrName") -> Func: | 1| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) + col = process_one_col(col) return Func(abs_func, col) - def mode(col: "AbstractColOrName") -> Func: """ Returns the most frequent value in a group. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -292,20 +265,14 @@ def mode(col: "AbstractColOrName") -> Func: |dotNET| 2012| +------+----------+ """ - if isinstance(col, str): - col = AbstractCol(col) + col = process_one_col(col) return Func(mode_func, col) - def max(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the maximum value of the expression in a group. - - - - Parameters ---------- @@ -327,21 +294,14 @@ def max(col: "AbstractColOrName") -> Func: | 9| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(max_func, col) - def min(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the minimum value of the expression in a group. - - - - Parameters ---------- @@ -363,22 +323,14 @@ def min(col: "AbstractColOrName") -> Func: | 0| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(min_func, col) - def max_by(col: "AbstractColOrName", ord: "AbstractColOrName") -> Func: """ Returns the value associated with the maximum value of ord. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -405,22 +357,15 @@ def max_by(col: "AbstractColOrName", ord: "AbstractColOrName") -> Func: |dotNET| 2013| +------+----------------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) + ord = process_one_col(ord) return Func(max_by_func, col, ord) - def min_by(col: "AbstractColOrName", ord: "AbstractColOrName") -> Func: """ Returns the value associated with the minimum value of ord. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -447,22 +392,14 @@ def min_by(col: "AbstractColOrName", ord: "AbstractColOrName") -> Func: |dotNET| 2012| +------+----------------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(min_by_func, col, ord) - def count(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the number of items in a group. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -485,22 +422,14 @@ def count(col: "AbstractColOrName") -> Func: | 4| 3| +--------+----------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(count_func, col) - def sum(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the sum of all values in the expression. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -521,22 +450,14 @@ def sum(col: "AbstractColOrName") -> Func: | 45| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(sum_func, col) - def avg(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the average of the values in a group. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -557,23 +478,15 @@ def avg(col: "AbstractColOrName") -> Func: | 4.5| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(avg_func, col) - def mean(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the average of the values in a group. An alias of :func:`avg`. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -594,21 +507,14 @@ def mean(col: "AbstractColOrName") -> Func: | 4.5| +-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(avg_func, col) - def median(col: "AbstractColOrName") -> Func: """ Returns the median of the values in a group. - - - - Parameters ---------- @@ -635,42 +541,27 @@ def median(col: "AbstractColOrName") -> Func: |dotNET| 10000.0| +------+----------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(median_func, col) - def sumDistinct(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the sum of distinct values in the expression. - - - - .. deprecated:: 3.2.0 Use :func:`sum_distinct` instead. """ - warn("Deprecated in 3.2, use sum_distinct instead.", FutureWarning) - if isinstance(col, str): - col = AbstractCol(col) - + warn("Deprecated, use sum_distinct instead.", FutureWarning) + col = process_one_col(col) return Func(sum_distinct_func, col) - def sum_distinct(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the sum of distinct values in the expression. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -691,22 +582,14 @@ def sum_distinct(col: "AbstractColOrName") -> Func: | 3| +---------------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(sum_distinct_func, col) - def product(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the product of the values in a group. - - - - - Parameters ---------- col : str, :class:`AbstractCol` @@ -730,22 +613,14 @@ def product(col: "AbstractColOrName") -> Func: | 2| 80.0| +----+-------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(product_func, col) - def acos(col: "AbstractColOrName") -> Func: """ Computes inverse cosine of the input AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -767,21 +642,18 @@ def acos(col: "AbstractColOrName") -> Func: | NaN| +--------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(acos_func, col) - def acosh(col: "AbstractColOrName") -> Func: """ Computes inverse hyperbolic cosine of the input AbstractCol. - - - + + + Parameters ---------- @@ -804,22 +676,14 @@ def acosh(col: "AbstractColOrName") -> Func: | 0.0| +---------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(acosh_func, col) - def asin(col: "AbstractColOrName") -> Func: """ Computes inverse sine of the input AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -841,22 +705,14 @@ def asin(col: "AbstractColOrName") -> Func: | NaN| +--------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(asin_func, col) - def asinh(col: "AbstractColOrName") -> Func: """ Computes inverse hyperbolic sine of the input AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -877,22 +733,14 @@ def asinh(col: "AbstractColOrName") -> Func: | 0.0| +---------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(asinh_func, col) - def atan(col: "AbstractColOrName") -> Func: """ Compute inverse tangent of the input AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -913,22 +761,14 @@ def atan(col: "AbstractColOrName") -> Func: | 0.0| +--------+ """ - if isinstance(col, str): - col = AbstractCol(col) - + col = process_one_col(col) return Func(atan_func, col) - def atanh(col: "AbstractColOrName") -> Func: """ Computes inverse hyperbolic tangent of the input AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -950,21 +790,20 @@ def atanh(col: "AbstractColOrName") -> Func: | NaN| +--------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(atanh_func, col) + col = process_one_col(col) + return Func(atanh_func, col) def cbrt(col: "AbstractColOrName") -> Func: """ Computes the cube-root of the given value. - - - + + + Parameters ---------- @@ -986,21 +825,20 @@ def cbrt(col: "AbstractColOrName") -> Func: | 3.0| +--------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(cbrt_func, col) + col = process_one_col(col) + return Func(cbrt_func, col) def ceil(col: "AbstractColOrName") -> Func: """ Computes the ceiling of the given value. - - - + + + Parameters ---------- @@ -1022,21 +860,20 @@ def ceil(col: "AbstractColOrName") -> Func: | 0| +----------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(ceil_func, col) + col = process_one_col(col) + return Func(ceil_func, col) def cos(col: "AbstractColOrName") -> Func: """ Computes cosine of the input AbstractCol. - - - + + + Parameters ---------- @@ -1055,21 +892,20 @@ def cos(col: "AbstractColOrName") -> Func: >>> df.select(cos(lit(math.pi))).first() Row(COS(3.14159...)=-1.0) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(cos_func, col) + col = process_one_col(col) + return Func(cos_func, col) def cosh(col: "AbstractColOrName") -> Func: """ Computes hyperbolic cosine of the input AbstractCol. - - - + + + Parameters ---------- @@ -1087,21 +923,20 @@ def cosh(col: "AbstractColOrName") -> Func: >>> df.select(cosh(lit(1))).first() Row(COSH(1)=1.54308...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(cosh_func, col) + col = process_one_col(col) + return Func(cosh_func, col) def cot(col: "AbstractColOrName") -> Func: """ Computes cotangent of the input AbstractCol. - - - + + + Parameters ---------- @@ -1120,21 +955,20 @@ def cot(col: "AbstractColOrName") -> Func: >>> df.select(cot(lit(math.radians(45)))).first() Row(COT(0.78539...)=1.00000...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(cot_func, col) + col = process_one_col(col) + return Func(cot_func, col) def csc(col: "AbstractColOrName") -> Func: """ Computes cosecant of the input AbstractCol. - - - + + + Parameters ---------- @@ -1153,21 +987,20 @@ def csc(col: "AbstractColOrName") -> Func: >>> df.select(csc(lit(math.radians(90)))).first() Row(CSC(1.57079...)=1.0) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(csc_func, col) + col = process_one_col(col) + return Func(csc_func, col) def exp(col: "AbstractColOrName") -> Func: """ Computes the exponential of the given value. - - - + + + Parameters ---------- @@ -1189,21 +1022,20 @@ def exp(col: "AbstractColOrName") -> Func: | 1.0| +------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(exp_func, col) + col = process_one_col(col) + return Func(exp_func, col) def expm1(col: "AbstractColOrName") -> Func: """ Computes the exponential of the given value minus one. - - - + + + Parameters ---------- @@ -1221,21 +1053,20 @@ def expm1(col: "AbstractColOrName") -> Func: >>> df.select(expm1(lit(1))).first() Row(EXPM1(1)=1.71828...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(expm1_func, col) + col = process_one_col(col) + return Func(expm1_func, col) def floor(col: "AbstractColOrName") -> Func: """ Computes the floor of the given value. - - - + + + Parameters ---------- @@ -1257,21 +1088,20 @@ def floor(col: "AbstractColOrName") -> Func: | 2| +----------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(floor_func, col) + col = process_one_col(col) + return Func(floor_func, col) def log(col: "AbstractColOrName") -> Func: """ Computes the natural logarithm of the given value. - - - + + + Parameters ---------- @@ -1290,21 +1120,20 @@ def log(col: "AbstractColOrName") -> Func: >>> df.select(log(lit(math.e))).first() Row(ln(2.71828...)=1.0) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(log_func, col, np.e) + col = process_one_col(col) + return Func(log_func, col, np.e) def log10(col: "AbstractColOrName") -> Func: """ Computes the logarithm of the given value in Base 10. - - - + + + Parameters ---------- @@ -1326,21 +1155,20 @@ def log10(col: "AbstractColOrName") -> Func: | 2.0| +----------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(log_func, col, 10) + col = process_one_col(col) + return Func(log_func, col, 10) def log1p(col: "AbstractColOrName") -> Func: """ Computes the natural logarithm of the "given value plus one". - - - + + + Parameters ---------- @@ -1364,11 +1192,10 @@ def log1p(col: "AbstractColOrName") -> Func: >>> df.select(log(lit(math.e+1))).first() Row(ln(3.71828...)=1.31326...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(log1p_func, col) + col = process_one_col(col) + return Func(log1p_func, col) def rint(col: "AbstractColOrName") -> Func: @@ -1376,10 +1203,10 @@ def rint(col: "AbstractColOrName") -> Func: Returns the double value that is closest in value to the argument and is equal to a mathematical integer. - - - + + + Parameters ---------- @@ -1408,21 +1235,20 @@ def rint(col: "AbstractColOrName") -> Func: | 10.0| +----------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(rint_func, col) + col = process_one_col(col) + return Func(rint_func, col) def sec(col: "AbstractColOrName") -> Func: """ Computes secant of the input AbstractCol. - - - + + + Parameters ---------- @@ -1440,21 +1266,20 @@ def sec(col: "AbstractColOrName") -> Func: >>> df.select(sec(lit(1.5))).first() Row(SEC(1.5)=14.13683...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(sec_func, col) + col = process_one_col(col) + return Func(sec_func, col) def signum(col: "AbstractColOrName") -> Func: """ Computes the signum of the given value. - - - + + + Parameters ---------- @@ -1483,21 +1308,20 @@ def signum(col: "AbstractColOrName") -> Func: | 1.0| +---------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(signum_func, col) + col = process_one_col(col) + return Func(signum_func, col) def sin(col: "AbstractColOrName") -> Func: """ Computes sine of the input AbstractCol. - - - + + + Parameters ---------- @@ -1516,21 +1340,20 @@ def sin(col: "AbstractColOrName") -> Func: >>> df.select(sin(lit(math.radians(90)))).first() Row(SIN(1.57079...)=1.0) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(sin_func, col) + col = process_one_col(col) + return Func(sin_func, col) def sinh(col: "AbstractColOrName") -> Func: """ Computes hyperbolic sine of the input AbstractCol. - - - + + + Parameters ---------- @@ -1549,21 +1372,20 @@ def sinh(col: "AbstractColOrName") -> Func: >>> df.select(sinh(lit(1.1))).first() Row(SINH(1.1)=1.33564...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(sinh_func, col) + col = process_one_col(col) + return Func(sinh_func, col) def tan(col: "AbstractColOrName") -> Func: """ Computes tangent of the input AbstractCol. - - - + + + Parameters ---------- @@ -1582,21 +1404,20 @@ def tan(col: "AbstractColOrName") -> Func: >>> df.select(tan(lit(math.radians(45)))).first() Row(TAN(0.78539...)=0.99999...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(tan_func, col) + col = process_one_col(col) + return Func(tan_func, col) def tanh(col: "AbstractColOrName") -> Func: """ Computes hyperbolic tangent of the input AbstractCol. - - - + + + Parameters ---------- @@ -1616,24 +1437,22 @@ def tanh(col: "AbstractColOrName") -> Func: >>> df.select(tanh(lit(math.radians(90)))).first() Row(TANH(1.57079...)=0.91715...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(tanh_func, col) + col = process_one_col(col) + return Func(tanh_func, col) def toDegrees(col: "AbstractColOrName") -> Func: """ - - Use :func:`degrees` instead. + + Use :func:`degrees` instead. """ warn("Deprecated by Spark, use degrees instead.", FutureWarning) return degrees_func(col) - def toRadians(col: "AbstractColOrName") -> Func: """ Use :func:`radians` instead. @@ -1642,7 +1461,6 @@ def toRadians(col: "AbstractColOrName") -> Func: return radians_func(col) - def bitwiseNOT(col: "AbstractColOrName") -> Func: """ Computes bitwise not. @@ -1653,12 +1471,11 @@ def bitwiseNOT(col: "AbstractColOrName") -> Func: return bitwise_not_func(col) - def bitwise_not(col: "AbstractColOrName") -> Func: """ Computes bitwise not. - + Parameters ---------- col : :class:`~osos.Col` or str @@ -1685,11 +1502,10 @@ def bitwise_not(col: "AbstractColOrName") -> Func: | -2| +---+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(bitwise_not_func, col) + col = process_one_col(col) + return Func(bitwise_not_func, col) def asc_nulls_first(col: "AbstractColOrName") -> Func: @@ -1697,7 +1513,7 @@ def asc_nulls_first(col: "AbstractColOrName") -> Func: Returns a sort expression based on the ascending order of the given AbstractCol name, and null values return before non-null values. - + Parameters ---------- col : :class:`~osos.Col` or str @@ -1723,11 +1539,10 @@ def asc_nulls_first(col: "AbstractColOrName") -> Func: +---+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(asc_func, col, nulls_first=True) + col = process_one_col(col) + return Func(asc_func, col, nulls_first=True) def asc_nulls_last(col: "AbstractColOrName") -> Func: @@ -1735,10 +1550,10 @@ def asc_nulls_last(col: "AbstractColOrName") -> Func: Returns a sort expression based on the ascending order of the given AbstractCol name, and null values appear after non-null values. - - - + + + Parameters ---------- @@ -1765,11 +1580,10 @@ def asc_nulls_last(col: "AbstractColOrName") -> Func: +---+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(asc_func, col, nulls_first=False) + col = process_one_col(col) + return Func(asc_func, col, nulls_first=False) def desc_nulls_first(col: "AbstractColOrName") -> Func: @@ -1777,10 +1591,10 @@ def desc_nulls_first(col: "AbstractColOrName") -> Func: Returns a sort expression based on the descending order of the given AbstractCol name, and null values appear before non-null values. - - - + + + Parameters ---------- @@ -1807,11 +1621,10 @@ def desc_nulls_first(col: "AbstractColOrName") -> Func: +---+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(desc_func, col, nulls_first=True) + col = process_one_col(col) + return Func(desc_func, col, nulls_first=True) def desc_nulls_last(col: "AbstractColOrName") -> Func: @@ -1819,10 +1632,10 @@ def desc_nulls_last(col: "AbstractColOrName") -> Func: Returns a sort expression based on the descending order of the given AbstractCol name, and null values appear after non-null values. - - - + + + Parameters ---------- @@ -1849,21 +1662,20 @@ def desc_nulls_last(col: "AbstractColOrName") -> Func: +---+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(desc_func, col, nulls_first=False) + col = process_one_col(col) + return Func(desc_func, col, nulls_first=False) def stddev(col: "AbstractColOrName") -> Func: """ Aggregate function: alias for stddev_samp. - - - + + + Parameters ---------- @@ -1881,11 +1693,10 @@ def stddev(col: "AbstractColOrName") -> Func: >>> df.select(stddev(df.id)).first() Row(stddev_samp(id)=1.87082...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(stddev_func, col) + col = process_one_col(col) + return Func(stddev_func, col) def stddev_samp(col: "AbstractColOrName") -> Func: @@ -1893,10 +1704,10 @@ def stddev_samp(col: "AbstractColOrName") -> Func: Aggregate function: returns the unbiased sample standard deviation of the expression in a group. - - - + + + Parameters ---------- @@ -1914,11 +1725,10 @@ def stddev_samp(col: "AbstractColOrName") -> Func: >>> df.select(stddev_samp(df.id)).first() Row(stddev_samp(id)=1.87082...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(stddev_samp_func, col) + col = process_one_col(col) + return Func(stddev_samp_func, col) def stddev_pop(col: "AbstractColOrName") -> Func: @@ -1926,10 +1736,10 @@ def stddev_pop(col: "AbstractColOrName") -> Func: Aggregate function: returns population standard deviation of the expression in a group. - - - + + + Parameters ---------- @@ -1947,21 +1757,20 @@ def stddev_pop(col: "AbstractColOrName") -> Func: >>> df.select(stddev_pop(df.id)).first() Row(stddev_pop(id)=1.70782...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(stddev_func, col) + col = process_one_col(col) + return Func(stddev_func, col) def variance(col: "AbstractColOrName") -> Func: """ Aggregate function: alias for var_samp - - - + + + Parameters ---------- @@ -1983,11 +1792,10 @@ def variance(col: "AbstractColOrName") -> Func: | 3.5| +------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(variance_func, col) + col = process_one_col(col) + return Func(variance_func, col) def var_samp(col: "AbstractColOrName") -> Func: @@ -1995,10 +1803,10 @@ def var_samp(col: "AbstractColOrName") -> Func: Aggregate function: returns the unbiased sample variance of the values in a group. - - - + + + Parameters ---------- @@ -2020,21 +1828,20 @@ def var_samp(col: "AbstractColOrName") -> Func: | 3.5| +------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(var_samp_func, col) + col = process_one_col(col) + return Func(var_samp_func, col) def var_pop(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the population variance of the values in a group. - - - + + + Parameters ---------- @@ -2052,21 +1859,20 @@ def var_pop(col: "AbstractColOrName") -> Func: >>> df.select(var_pop(df.id)).first() Row(var_pop(id)=2.91666...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(variance_func, col) + col = process_one_col(col) + return Func(variance_func, col) def skewness(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the skewness of the values in a group. - - - + + + Parameters ---------- @@ -2084,21 +1890,20 @@ def skewness(col: "AbstractColOrName") -> Func: >>> df.select(skewness(df.c)).first() Row(skewness(c)=0.70710...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(skewness_func, col) + col = process_one_col(col) + return Func(skewness_func, col) def kurtosis(col: "AbstractColOrName") -> Func: """ Aggregate function: returns the kurtosis of the values in a group. - - - + + + Parameters ---------- @@ -2120,18 +1925,17 @@ def kurtosis(col: "AbstractColOrName") -> Func: | -1.5| +-----------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(kurtosis_func, col) + col = process_one_col(col) + return Func(kurtosis_func, col) def collect_list(col: "AbstractColOrName") -> Func: """ Aggregate function: returns a list of objects with duplicates. - + Notes ----- @@ -2154,20 +1958,19 @@ def collect_list(col: "AbstractColOrName") -> Func: >>> df2.agg(collect_list('age')).collect() [Row(collect_list(age)=[2, 5, 5])] """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(collect_list_func, col) + col = process_one_col(col) + return Func(collect_list_func, col) def collect_set(col: "AbstractColOrName") -> Func: """ Aggregate function: returns a set of objects with duplicate elements eliminated. - - - + + + Notes ----- @@ -2190,10 +1993,9 @@ def collect_set(col: "AbstractColOrName") -> Func: >>> df2.agg(array_sort(collect_set('age')).alias('c')).collect() [Row(c=[2, 5])] """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(collect_set_func, col) + col = process_one_col(col) + return Func(collect_set_func, col) def degrees(col: "AbstractColOrName") -> Func: @@ -2201,10 +2003,10 @@ def degrees(col: "AbstractColOrName") -> Func: Converts an angle measured in radians to an approximately equivalent angle measured in degrees. - - - + + + Parameters ---------- @@ -2223,11 +2025,10 @@ def degrees(col: "AbstractColOrName") -> Func: >>> df.select(degrees(lit(math.pi))).first() Row(DEGREES(3.14159...)=180.0) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(degrees_func, col) + col = process_one_col(col) + return Func(degrees_func, col) def radians(col: "AbstractColOrName") -> Func: @@ -2235,10 +2036,10 @@ def radians(col: "AbstractColOrName") -> Func: Converts an angle measured in degrees to an approximately equivalent angle measured in radians. - - - + + + Parameters ---------- @@ -2256,21 +2057,20 @@ def radians(col: "AbstractColOrName") -> Func: >>> df.select(radians(lit(180))).first() Row(RADIANS(180)=3.14159...) """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(radians_func, col) + col = process_one_col(col) + return Func(radians_func, col) def atan2( col1: Union["AbstractColOrName", float], col2: Union["AbstractColOrName", float] ) -> Func: """ - - - + + + Parameters ---------- @@ -2303,17 +2103,16 @@ def atan2( return Func(atan2_func, col1, col2) - def hypot( col1: Union["AbstractColOrName", float], col2: Union["AbstractColOrName", float] ) -> Func: """ Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow. - - - + + + Parameters ---------- @@ -2342,17 +2141,16 @@ def hypot( return Func(hypot_func, col1, col2) - def pow( col1: Union["AbstractColOrName", float], col2: Union["AbstractColOrName", float] ) -> Func: """ Returns the value of the first argument raised to the power of the second argument. - - - + + + Parameters ---------- @@ -2381,7 +2179,6 @@ def pow( return Func(pow_func, col1, col2) - def pmod( dividend: Union["AbstractColOrName", float], divisor: Union["AbstractColOrName", float], @@ -2389,10 +2186,10 @@ def pmod( """ Returns the positive value of dividend mod divisor. - - - + + + Parameters ---------- @@ -2439,7 +2236,6 @@ def pmod( return Func(pmod_func, col1, col2) - def row_number() -> Func: """ Window function: returns a sequential number starting at 1 within a window partition. @@ -2467,7 +2263,6 @@ def row_number() -> Func: return Func(row_number_func, AbstractIndex()) - def dense_rank() -> Func: """ Window function: returns the rank of rows within a window partition, without any gaps. @@ -2480,10 +2275,10 @@ def dense_rank() -> Func: This is equivalent to the DENSE_RANK function in SQL. - - - + + + Returns ------- @@ -2510,7 +2305,6 @@ def dense_rank() -> Func: return Func(dense_rank_func, AbstractIndex()) - def rank() -> Func: """ Window function: returns the rank of rows within a window partition. @@ -2523,10 +2317,10 @@ def rank() -> Func: This is equivalent to the RANK function in SQL. - - - + + + Returns ------- @@ -2553,16 +2347,15 @@ def rank() -> Func: return Func(rank_func, AbstractIndex()) - def cume_dist() -> Func: """ Window function: returns the cumulative distribution of values within a window partition, i.e. the fraction of rows that are below the current row. - - - + + + Returns ------- @@ -2588,15 +2381,14 @@ def cume_dist() -> Func: return Func(cume_dist_func, AbstractIndex()) - def percent_rank() -> Func: """ Window function: returns the relative rank (i.e. percentile) of rows within a window partition. - - - + + + Returns ------- @@ -2623,16 +2415,14 @@ def percent_rank() -> Func: return Func(percent_rank_func(), AbstractIndex) - def approxCountDistinct(col: "AbstractColOrName", rsd: Optional[float] = None) -> Func: """ Use :func:`approx_count_distinct` instead. """ warn("Deprecated by Spark, use approx_count_distinct instead.", FutureWarning) - if isinstance(col, str): - col = AbstractCol(col) - return Func(approx_count_distinct_func, col, rsd) + col = process_one_col(col) + return Func(approx_count_distinct_func, col, rsd) def approx_count_distinct( @@ -2664,18 +2454,16 @@ def approx_count_distinct( | 3| +---------------+ """ - if isinstance(col, str): - col = AbstractCol(col) + col = process_one_col(col) return Func(approx_count_distinct_func, col, rsd if rsd is not None else 0.05) - def broadcast(df: DataFrame) -> DataFrame: """ Marks a DataFrame as small enough for use in broadcast joins. - + Returns ------- :class:`~osos.DataFrame` @@ -2699,15 +2487,9 @@ def broadcast(df: DataFrame) -> DataFrame: return df - def coalesce(*cols: "AbstractColOrName") -> Func: """Returns the first AbstractCol that is not null. - - - - - Parameters ---------- cols : :class:`~osos.Col` or str @@ -2748,23 +2530,14 @@ def coalesce(*cols: "AbstractColOrName") -> Func: |null| 2| 0.0| +----+----+----------------+ """ - for col in cols: - if isinstance(col, str): - col = AbstractCol(col) - + cols = flatten_and_process_cols(cols) return Func(coalesce_func, cols) - def corr(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: """Returns a new :class:`~osos.Col` for the Pearson Correlation Coefficient for ``col1`` and ``col2``. - - - - - Parameters ---------- col1 : :class:`~osos.Col` or str @@ -2785,19 +2558,15 @@ def corr(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: >>> df.agg(corr("a", "b").alias('c')).collect() [Row(c=1.0)] """ - if isinstance(col1, str): - col1 = AbstractCol(col1) - if isinstance(col2, str): - col2 = AbstractCol(col2) + col1, col2 = flatten_and_process_cols([col1, col2]) return Func(corr_func, col1, col2) - def covar_pop(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: """Returns a new :class:`~osos.Col` for the population covariance of ``col1`` and ``col2``. - + Parameters ---------- col1 : :class:`~osos.Col` or str @@ -2818,23 +2587,14 @@ def covar_pop(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: >>> df.agg(covar_pop("a", "b").alias('c')).collect() [Row(c=0.0)] """ - if isinstance(col1, str): - col1 = AbstractCol(col1) - if isinstance(col2, str): - col2 = AbstractCol(col2) + col1, col2 = flatten_and_process_cols([col1, col2]) return Func(covar_pop_func, col1, col2) - def covar_samp(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: """Returns a new :class:`~osos.Col` for the sample covariance of ``col1`` and ``col2``. - - - - - Parameters ---------- col1 : :class:`~osos.Col` or str @@ -2855,40 +2615,24 @@ def covar_samp(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: >>> df.agg(covar_samp("a", "b").alias('c')).collect() [Row(c=0.0)] """ - if isinstance(col1, str): - col1 = AbstractCol(col1) - if isinstance(col2, str): - col2 = AbstractCol(col2) + col1, col2 = flatten_and_process_cols([col1, col2]) return Func(covar_samp_func, col1, col2) - def countDistinct(col: "AbstractColOrName", *cols: "AbstractColOrName") -> Func: """Returns a new :class:`~osos.Col` for distinct count of ``col`` or ``cols``. An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct` directly. - - - - """ - if isinstance(col, str): - col = AbstractCol(col) - for col in cols: - col = AbstractCol(col) - return Func(count_distinct_func, col, *cols) - + cols = flatten_and_process_cols([col, *cols]) + return Func(count_distinct_func, *cols) def count_distinct(col: "AbstractColOrName", *cols: "AbstractColOrName") -> Func: """Returns a new :class:`AbstractCol` for distinct count of ``col`` or ``cols``. - - - - Parameters ---------- @@ -2925,13 +2669,8 @@ def count_distinct(col: "AbstractColOrName", *cols: "AbstractColOrName") -> Func | 4| +----------------------------+ """ - cols = list(cols) - if isinstance(col, str): - col = AbstractCol(col) - for col_ in cols: - col_ = AbstractCol(col_) - return Func(count_distinct_func, col, *cols) - + cols = flatten_and_process_cols([col, *cols]) + return Func(count_distinct_func, *cols) def first(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: @@ -2940,10 +2679,6 @@ def first(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: The function by default returns the first values it sees. It will return the first non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. - - - - Notes ----- @@ -2984,11 +2719,9 @@ def first(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: | Bob| 5| +-----+----------+ """ - - if isinstance(col, str): - col = AbstractCol(col) - return Func(first_func, col) + col = process_one_col(col) + return Func(first_func, col) def grouping(col: "AbstractColOrName") -> Func: @@ -2996,10 +2729,6 @@ def grouping(col: "AbstractColOrName") -> Func: Aggregate function: indicates whether a specified AbstractCol in a GROUP BY list is aggregated or not, returns 1 for aggregated or 0 for not aggregated in the result set. - - - - Parameters ---------- @@ -3023,10 +2752,9 @@ def grouping(col: "AbstractColOrName") -> Func: | Bob| 0| 5| +-----+--------------+--------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(grouping_func, col) + col = process_one_col(col) + return Func(grouping_func, col) def grouping_id(*cols: "AbstractColOrName") -> Func: @@ -3035,7 +2763,7 @@ def grouping_id(*cols: "AbstractColOrName") -> Func: (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) - + Notes ----- The list of AbstractCols should match with grouping AbstractCols exactly, or empty (means all @@ -3069,15 +2797,14 @@ def grouping_id(*cols: "AbstractColOrName") -> Func: | b| c| 0| 4| +----+----+-------------+-------+ """ - return Func(grouping_id_func(), AbstractIndex()) - + return Func(grouping_id_func, AbstractIndex()) def input_file_name() -> Func: """ Creates a string AbstractCol for the file name of the current Spark task. - + Returns @@ -3096,11 +2823,10 @@ def input_file_name() -> Func: raise NotImplementedError - def isnan(col: "AbstractColOrName") -> Func: """An expression that returns true if the AbstractCol is NaN. - + Parameters ---------- @@ -3123,10 +2849,9 @@ def isnan(col: "AbstractColOrName") -> Func: |NaN|2.0| true|false| +---+---+-----+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(isnull_func, col) + col = process_one_col(col) + return Func(isnull_func, col) def isnull(col: "AbstractColOrName") -> Func: @@ -3154,10 +2879,9 @@ def isnull(col: "AbstractColOrName") -> Func: |null| 2| true|false| +----+----+-----+-----+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(isnull_func, col) + col = process_one_col(col) + return Func(isnull_func, col) def last(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: @@ -3166,11 +2890,6 @@ def last(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: The function by default returns the last values it sees. It will return the last non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. - - - - - Notes ----- The function is non-deterministic because its results depends on the order of the @@ -3210,10 +2929,9 @@ def last(col: "AbstractColOrName", ignorenulls: bool = False) -> Func: | Bob| 5| +-----+---------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(last_func, col, ignorenulls=ignorenulls) + col = process_one_col(col) + return Func(last_func, col, ignorenulls=ignorenulls) def monotonically_increasing_id() -> Func: @@ -3233,7 +2951,6 @@ def monotonically_increasing_id() -> Func: return Func(monotonically_increasing_id_func, AbstractIndex()) - def nanvl(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: """Returns col1 if it is not NaN, or col2 if col1 is NaN. @@ -3258,14 +2975,10 @@ def nanvl(col1: "AbstractColOrName", col2: "AbstractColOrName") -> Func: >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect() [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)] """ - if isinstance(col1, str): - col1 = AbstractCol(col1) - if isinstance(col2, str): - col2 = AbstractCol(col2) + col1, col2 = flatten_and_process_cols([col1, col2]) return Func(nanvl_func, col1, col2) - def percentile_approx( col: "AbstractColOrName", percentage: Union[AbstractCol, float, List[float], Tuple[float]], @@ -3315,27 +3028,18 @@ def percentile_approx( |-- median: double (nullable = true) """ - if isinstance(percentage, AbstractCol): - # Already a AbstractCol - percentage = AbstractCol(percentage) - else: - # Probably scalar - percentage = AbstractLit(percentage) - - accuracy = ( - accuracy if isinstance(accuracy, AbstractCol) - else AbstractLit(accuracy) - ) + col = process_one_col(col) + percentage = process_one_col_or_lit(percentage) + accuracy = process_one_col_or_lit(accuracy) return percentile_approx_func(col, percentage, accuracy) - def rand(seed: Optional[int] = None) -> Func: """Generates a random AbstractCol with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0). - + Notes ----- The function is non-deterministic in general case. @@ -3367,12 +3071,11 @@ def rand(seed: Optional[int] = None) -> Func: return Func(rand_func, seed=0) - def randn(seed: Optional[int] = None) -> Func: """Generates a AbstractCol with independent and identically distributed (i.i.d.) samples from the standard normal distribution. - + Notes ----- @@ -3405,13 +3108,12 @@ def randn(seed: Optional[int] = None) -> Func: return Func(randn_func, seed) - def round(col: "AbstractColOrName", scale: int = 0) -> Func: """ Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0 or at integral part when `scale` < 0. - + Parameters ---------- col : :class:`~osos.Col` or str @@ -3429,8 +3131,9 @@ def round(col: "AbstractColOrName", scale: int = 0) -> Func: >>> OsosSession.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect() [Row(r=3.0)] """ - return round_func(col,scale=scale,mode="HALF_UP") - + col = process_one_col(col) + scale = process_one_col_or_lit(scale) + return round_func(col, scale=scale, mode="HALF_UP") def bround(col: "AbstractColOrName", scale: int = 0) -> Func: @@ -3455,8 +3158,9 @@ def bround(col: "AbstractColOrName", scale: int = 0) -> Func: >>> OsosSession.createDataFrame([(2.5,)], ['a']).select(bround('a', 0).alias('r')).collect() [Row(r=2.0)] """ - return round_func(col,scale=scale,mode="HALF_EVEN") - + col = process_one_col(col) + scale = process_one_col_or_lit(scale) + return round_func(col, scale=scale, mode="HALF_EVEN") def shiftLeft(col: "AbstractColOrName", numBits: int) -> Func: @@ -3466,10 +3170,10 @@ def shiftLeft(col: "AbstractColOrName", numBits: int) -> Func: Use :func:`shiftleft` instead. """ warn("Deprecated, use shiftleft instead.", FutureWarning) + col = process_one_col(col) return shiftleft_func(col, numBits) - def shiftleft(col: "AbstractColOrName", numBits: int) -> Func: """Shift the given value numBits left. @@ -3490,18 +3194,17 @@ def shiftleft(col: "AbstractColOrName", numBits: int) -> Func: >>> OsosSession.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect() [Row(r=42)] """ + col = process_one_col(col) return shiftleft_func(col, numBits) - def shiftRight(col: "AbstractColOrName", numBits: int) -> Func: - """(Signed) shift the given value numBits right. - """ + """(Signed) shift the given value numBits right.""" warn("Deprecated, use shiftright instead.", FutureWarning) + col = process_one_col(col) return shiftright_func(col, numBits) - def shiftright(col: "AbstractColOrName", numBits: int) -> Func: """(Signed) shift the given value numBits right. @@ -3522,17 +3225,17 @@ def shiftright(col: "AbstractColOrName", numBits: int) -> Func: >>> OsosSession.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect() [Row(r=21)] """ + col = process_one_col(col) return shiftright_func(col, numBits) def shiftRightUnsigned(col: "AbstractColOrName", numBits: int) -> Func: - """Unsigned shift the given value numBits right. - """ + """Unsigned shift the given value numBits right.""" warn("Deprecated, use shiftrightunsigned instead.", FutureWarning) + col = process_one_col(col) return shiftrightunsigned_func(col, numBits) - def shiftrightunsigned(col: "AbstractColOrName", numBits: int) -> Func: """Unsigned shift the given value numBits right. @@ -3554,10 +3257,10 @@ def shiftrightunsigned(col: "AbstractColOrName", numBits: int) -> Func: >>> df.select(shiftrightunsigned('a', 1).alias('r')).collect() [Row(r=9223372036854775787)] """ + col = process_one_col(col) return shiftright_func(col, numBits) - def spark_partition_id() -> Func: """A AbstractCol for partition ID. @@ -3579,7 +3282,6 @@ def spark_partition_id() -> Func: raise NotImplementedError("Spark function not applicable to `osos`") - def expr(str: str) -> Func: """Parses the expression string into the AbstractCol that it represents @@ -3619,7 +3321,6 @@ def struct( ... - def struct( *cols: Union[ "AbstractColOrName", @@ -3651,7 +3352,6 @@ def struct( raise NotImplementedError - def greatest(*cols: "AbstractColOrName") -> Func: """ Returns the greatest value of the list of AbstractCol names, skipping null values. @@ -3679,11 +3379,10 @@ def greatest(*cols: "AbstractColOrName") -> Func: error_class="WRONG_NUM_AbstractColS", message_parameters={"func_name": "greatest", "num_cols": "2"}, ) - cols = flatten_and_process(cols) + cols = flatten_and_process_cols(cols) return greatest_func(cols) - def least(*cols: "AbstractColOrName") -> Func: """ Returns the least value of the list of AbstractCol names, skipping null values. @@ -3711,7 +3410,7 @@ def least(*cols: "AbstractColOrName") -> Func: error_class="WRONG_NUM_AbstractColS", message_parameters={"func_name": "least", "num_cols": "2"}, ) - cols = flatten_and_process(cols) + cols = flatten_and_process_cols(cols) return least_func(cols) @@ -3754,7 +3453,7 @@ def when(condition: AbstractCol, value: Any) -> Func: | 3| +----+ """ - return When(condition,value) + return When(condition, value) @overload # type: ignore[no-redef] @@ -3767,7 +3466,6 @@ def log(arg1: float, arg2: "AbstractColOrName") -> Func: ... - def log( arg1: Union["AbstractColOrName", float], arg2: Optional["AbstractColOrName"] = None ) -> Func: @@ -3775,7 +3473,7 @@ def log( If there is only one argument, then this takes the natural logarithm of the argument. - + Parameters ---------- arg1 : :class:`~osos.Col`, str or float @@ -3811,14 +3509,15 @@ def log( |4.605170185988092| +-----------------+ """ + arg1 = process_one_col_or_lit(arg1) + arg2 = arg2 if arg2 is None else process_one_col_or_lit(arg2) if arg2 is None: - return log_func(arg1,base=np.e) + return log_func(arg1, base=np.e) else: return log_func(arg2, base=arg1) - def log2(col: "AbstractColOrName") -> Func: """Returns the base-2 logarithm of the argument. @@ -3843,8 +3542,8 @@ def log2(col: "AbstractColOrName") -> Func: | 2.0| +----+ """ - return log_func(col,AbstractLit(2)) - + col = process_one_col(col) + return log_func(col, AbstractLit(2)) def conv(col: "AbstractColOrName", fromBase: int, toBase: int) -> Func: @@ -3875,15 +3574,10 @@ def conv(col: "AbstractColOrName", fromBase: int, toBase: int) -> Func: raise NotImplementedError - def factorial(col: "AbstractColOrName") -> Func: """ Computes the factorial of the given value. - - - - Parameters ---------- @@ -3907,7 +3601,6 @@ def factorial(col: "AbstractColOrName") -> Func: # --------------- Window functions ------------------------ - def lag( col: "AbstractColOrName", offset: int = 1, default: Optional[Any] = None ) -> Func: @@ -3918,11 +3611,6 @@ def lag( This is equivalent to the LAG function in SQL. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -3987,8 +3675,8 @@ def lag( | b| 8| -1| +---+---+-------------+ """ - if isinstance(col, str): - col = AbstractCol(col) + + col = process_one_col(col) if not isinstance(offset, Node): offset = SimpleContainer(offset, ()) if not isinstance(default, Node): @@ -3996,7 +3684,6 @@ def lag( return Func(lag_func, col, offset, default) - def lead( col: "AbstractColOrName", offset: int = 1, default: Optional[Any] = None ) -> Func: @@ -4007,11 +3694,6 @@ def lead( This is equivalent to the LEAD function in SQL. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4076,8 +3758,8 @@ def lead( | b| 8| -1| +---+---+----------+ """ - if isinstance(col, str): - col = AbstractCol(col) + + col = process_one_col(col) if not isinstance(offset, Node): offset = SimpleContainer(offset, ()) if not isinstance(default, Node): @@ -4085,7 +3767,6 @@ def lead( return Func(lead_func, col, offset) - def nth_value( col: "AbstractColOrName", offset: int, ignoreNulls: Optional[bool] = False ) -> Func: @@ -4098,11 +3779,6 @@ def nth_value( This is equivalent to the nth_value function in SQL. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4161,7 +3837,6 @@ def nth_value( raise NotImplementedError - def ntile(n: int) -> Func: """ Window function: returns the ntile group id (from 1 to `n` inclusive) @@ -4171,10 +3846,6 @@ def ntile(n: int) -> Func: This is equivalent to the NTILE function in SQL. - - - - Parameters ---------- @@ -4222,17 +3893,11 @@ def ntile(n: int) -> Func: # ---------------------- Date/Timestamp functions ------------------------------ - def current_date() -> Func: """ Returns the current date at the start of query evaluation as a :class:`DateType` AbstractCol. All calls of current_date within the same query return the same value. - - - - - Returns ------- :class:`~osos.Col` @@ -4251,17 +3916,11 @@ def current_date() -> Func: raise NotImplementedError - def current_timestamp() -> Func: """ Returns the current timestamp at the start of query evaluation as a :class:`TimestampType` AbstractCol. All calls of current_timestamp within the same query return the same value. - - - - - Returns ------- :class:`~osos.Col` @@ -4280,18 +3939,12 @@ def current_timestamp() -> Func: raise NotImplementedError - def localtimestamp() -> Func: """ Returns the current timestamp without time zone at the start of query evaluation as a timestamp without time zone AbstractCol. All calls of localtimestamp within the same query return the same value. - - - - - Returns ------- :class:`~osos.Col` @@ -4310,7 +3963,6 @@ def localtimestamp() -> Func: raise NotImplementedError - def date_format(date: "AbstractColOrName", format: str) -> Func: """ Converts a date/timestamp/string to a value of string in the format specified by the date @@ -4321,11 +3973,6 @@ def date_format(date: "AbstractColOrName", format: str) -> Func: .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html - - - - - Notes ----- Whenever possible, use specialized functions like `year`. @@ -4351,16 +3998,10 @@ def date_format(date: "AbstractColOrName", format: str) -> Func: raise NotImplementedError - def year(col: "AbstractColOrName") -> Func: """ Extract the year of a given date/timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4380,16 +4021,10 @@ def year(col: "AbstractColOrName") -> Func: raise NotImplementedError - def quarter(col: "AbstractColOrName") -> Func: """ Extract the quarter of a given date/timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4409,16 +4044,10 @@ def quarter(col: "AbstractColOrName") -> Func: raise NotImplementedError - def month(col: "AbstractColOrName") -> Func: """ Extract the month of a given date/timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4438,17 +4067,11 @@ def month(col: "AbstractColOrName") -> Func: raise NotImplementedError - def dayofweek(col: "AbstractColOrName") -> Func: """ Extract the day of the week of a given date/timestamp as integer. Ranges from 1 for a Sunday through to 7 for a Saturday - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4468,16 +4091,10 @@ def dayofweek(col: "AbstractColOrName") -> Func: raise NotImplementedError - def dayofmonth(col: "AbstractColOrName") -> Func: """ Extract the day of the month of a given date/timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4497,16 +4114,10 @@ def dayofmonth(col: "AbstractColOrName") -> Func: raise NotImplementedError - def dayofyear(col: "AbstractColOrName") -> Func: """ Extract the day of the year of a given date/timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4526,16 +4137,10 @@ def dayofyear(col: "AbstractColOrName") -> Func: raise NotImplementedError - def hour(col: "AbstractColOrName") -> Func: """ Extract the hours of a given timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4556,16 +4161,10 @@ def hour(col: "AbstractColOrName") -> Func: raise NotImplementedError - def minute(col: "AbstractColOrName") -> Func: """ Extract the minutes of a given timestamp as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4586,16 +4185,10 @@ def minute(col: "AbstractColOrName") -> Func: raise NotImplementedError - def second(col: "AbstractColOrName") -> Func: """ Extract the seconds of a given date as integer. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4616,18 +4209,12 @@ def second(col: "AbstractColOrName") -> Func: raise NotImplementedError - def weekofyear(col: "AbstractColOrName") -> Func: """ Extract the week number of a given date as integer. A week is considered to start on a Monday and week 1 is the first week with more than 3 days, as defined by ISO 8601 - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4647,18 +4234,12 @@ def weekofyear(col: "AbstractColOrName") -> Func: raise NotImplementedError - def make_date( year: "AbstractColOrName", month: "AbstractColOrName", day: "AbstractColOrName" ) -> Func: """ Returns a AbstractCol with a date built from the year, month and day AbstractCols. - - - - - Parameters ---------- year : :class:`~osos.Col` or str @@ -4682,17 +4263,11 @@ def make_date( raise NotImplementedError - def date_add(start: "AbstractColOrName", days: Union["AbstractColOrName", int]) -> Func: """ Returns the date that is `days` days after `start`. If `days` is a negative value then these amount of days will be deducted from `start`. - - - - - Parameters ---------- start : :class:`~osos.Col` or str @@ -4720,17 +4295,11 @@ def date_add(start: "AbstractColOrName", days: Union["AbstractColOrName", int]) raise NotImplementedError - def date_sub(start: "AbstractColOrName", days: Union["AbstractColOrName", int]) -> Func: """ Returns the date that is `days` days before `start`. If `days` is a negative value then these amount of days will be added to `start`. - - - - - Parameters ---------- start : :class:`~osos.Col` or str @@ -4758,16 +4327,10 @@ def date_sub(start: "AbstractColOrName", days: Union["AbstractColOrName", int]) raise NotImplementedError - def datediff(end: "AbstractColOrName", start: "AbstractColOrName") -> Func: """ Returns the number of days from `start` to `end`. - - - - - Parameters ---------- end : :class:`~osos.Col` or str @@ -4789,7 +4352,6 @@ def datediff(end: "AbstractColOrName", start: "AbstractColOrName") -> Func: raise NotImplementedError - def add_months( start: "AbstractColOrName", months: Union["AbstractColOrName", int] ) -> Func: @@ -4797,11 +4359,6 @@ def add_months( Returns the date that is `months` months after `start`. If `months` is a negative value then these amount of months will be deducted from the `start`. - - - - - Parameters ---------- start : :class:`~osos.Col` or str @@ -4829,7 +4386,6 @@ def add_months( raise NotImplementedError - def months_between( date1: "AbstractColOrName", date2: "AbstractColOrName", roundOff: bool = True ) -> Func: @@ -4840,11 +4396,6 @@ def months_between( of their respective months. Otherwise, the difference is calculated assuming 31 days per month. The result is rounded off to 8 digits unless `roundOff` is set to `False`. - - - - - Parameters ---------- date1 : :class:`~osos.Col` or str @@ -4872,7 +4423,6 @@ def months_between( ) - def to_date(col: "AbstractColOrName", format: Optional[str] = None) -> Func: """Converts a :class:`~osos.Col` into :class:`pyspark.sql.types.DateType` using the optionally specified format. Specify formats according to `datetime pattern`_. @@ -4881,11 +4431,6 @@ def to_date(col: "AbstractColOrName", format: Optional[str] = None) -> Func: .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -4924,7 +4469,6 @@ def to_timestamp(col: "AbstractColOrName", format: str) -> Func: ... - def to_timestamp(col: "AbstractColOrName", format: Optional[str] = None) -> Func: """Converts a :class:`~osos.Col` into :class:`pyspark.sql.types.TimestampType` using the optionally specified format. Specify formats according to `datetime pattern`_. @@ -4933,10 +4477,6 @@ def to_timestamp(col: "AbstractColOrName", format: Optional[str] = None) -> Func .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html - - - - Parameters ---------- @@ -4966,16 +4506,10 @@ def to_timestamp(col: "AbstractColOrName", format: Optional[str] = None) -> Func raise NotImplementedError - def trunc(date: "AbstractColOrName", format: str) -> Func: """ Returns date truncated to the unit specified by the format. - - - - - Parameters ---------- date : :class:`~osos.Col` or str @@ -5001,16 +4535,10 @@ def trunc(date: "AbstractColOrName", format: str) -> Func: raise NotImplementedError - def date_trunc(format: str, timestamp: "AbstractColOrName") -> Func: """ Returns timestamp truncated to the unit specified by the format. - - - - - Parameters ---------- format : str @@ -5038,17 +4566,11 @@ def date_trunc(format: str, timestamp: "AbstractColOrName") -> Func: raise NotImplementedError - def next_day(date: "AbstractColOrName", dayOfWeek: str) -> Func: """ Returns the first date which is later than the value of the date AbstractCol based on second `week day` argument. - - - - - Parameters ---------- date : :class:`~osos.Col` or str @@ -5071,16 +4593,10 @@ def next_day(date: "AbstractColOrName", dayOfWeek: str) -> Func: raise NotImplementedError - def last_day(date: "AbstractColOrName") -> Func: """ Returns the last day of the month which the given date belongs to. - - - - - Parameters ---------- date : :class:`~osos.Col` or str @@ -5100,7 +4616,6 @@ def last_day(date: "AbstractColOrName") -> Func: raise NotImplementedError - def from_unixtime( timestamp: "AbstractColOrName", format: str = "yyyy-MM-dd HH:mm:ss" ) -> Func: @@ -5109,11 +4624,6 @@ def from_unixtime( representing the timestamp of that moment in the current system time zone in the given format. - - - - - Parameters ---------- timestamp : :class:`~osos.Col` or str @@ -5147,7 +4657,6 @@ def unix_timestamp() -> Func: ... - def unix_timestamp( timestamp: Optional["AbstractColOrName"] = None, format: str = "yyyy-MM-dd HH:mm:ss" ) -> Func: @@ -5158,10 +4667,6 @@ def unix_timestamp( if `timestamp` is None, then it returns current timestamp. - - - - Parameters ---------- @@ -5188,7 +4693,6 @@ def unix_timestamp( raise NotImplementedError - def from_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") -> Func: """ This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function @@ -5204,10 +4708,6 @@ def from_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") according to the timezone in the string, and finally display the result by converting the timestamp to string according to the session local timezone. - - - - Parameters ---------- @@ -5221,7 +4721,7 @@ def from_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") supported as aliases of '+00:00'. Other short names are not recommended to use because they can be ambiguous. - + `tz` can take a :class:`~osos.Col` containing timezone ID strings. Returns @@ -5242,7 +4742,6 @@ def from_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") raise NotImplementedError - def to_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") -> Func: """ This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function @@ -5258,10 +4757,6 @@ def to_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") -> according to the timezone in the string, and finally display the result by converting the timestamp to string according to the session local timezone. - - - - Parameters ---------- @@ -5275,7 +4770,7 @@ def to_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") -> supported as aliases of '+00:00'. Other short names are not recommended to use because they can be ambiguous. - + `tz` can take a :class:`~osos.Col` containing timezone ID strings. Returns @@ -5296,16 +4791,11 @@ def to_utc_timestamp(timestamp: "AbstractColOrName", tz: "AbstractColOrName") -> raise NotImplementedError - def timestamp_seconds(col: "AbstractColOrName") -> Func: """ Converts the number of seconds from the Unix epoch (1970-01-01T00:00:00Z) to a timestamp. - - - - Parameters ---------- @@ -5337,7 +4827,6 @@ def timestamp_seconds(col: "AbstractColOrName") -> Func: raise NotImplementedError - def window( timeAbstractCol: "AbstractColOrName", windowDuration: str, @@ -5362,10 +4851,6 @@ def window( The output AbstractCol will be a struct called 'window' by default with the nested AbstractCols 'start' and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. - - - - Parameters ---------- @@ -5433,7 +4918,6 @@ def check_string_field(field, fieldName): # type: ignore[no-untyped-def] raise NotImplementedError - def window_time( windowAbstractCol: "AbstractColOrName", ) -> Func: @@ -5444,11 +4928,6 @@ def window_time( ``window.end - lit(1).alias("microsecond")`` (as microsecond is the minimal supported event time precision). The window AbstractCol must be one produced by a window aggregating operator. - - - - - Parameters ---------- windowAbstractCol : :class:`~osos.Col` @@ -5483,7 +4962,6 @@ def window_time( raise NotImplementedError - def session_window( timeAbstractCol: "AbstractColOrName", gapDuration: Union[AbstractCol, str] ) -> Func: @@ -5504,11 +4982,6 @@ def session_window( The output AbstractCol will be a struct called 'session_window' by default with the nested AbstractCols 'start' and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. - - - - - Parameters ---------- timeAbstractCol : :class:`~osos.Col` or str @@ -5558,15 +5031,11 @@ def check_field(field: Union[AbstractCol, str], fieldName: str) -> None: # ---------------------------- misc functions ---------------------------------- - def crc32(col: "AbstractColOrName") -> Func: """ Calculates the cyclic redundancy check value (CRC32) of a binary AbstractCol and returns the value as a bigint. - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5577,7 +5046,7 @@ def crc32(col: "AbstractColOrName") -> Func: :class:`~osos.Col` the AbstractCol for computed results. - + Examples -------- @@ -5587,15 +5056,8 @@ def crc32(col: "AbstractColOrName") -> Func: raise NotImplementedError - def md5(col: "AbstractColOrName") -> Func: """Calculates the MD5 digest and returns the value as a 32 character hex string. - - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5614,15 +5076,9 @@ def md5(col: "AbstractColOrName") -> Func: raise NotImplementedError - def sha1(col: "AbstractColOrName") -> Func: """Returns the hex string result of SHA-1. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5641,17 +5097,11 @@ def sha1(col: "AbstractColOrName") -> Func: raise NotImplementedError - def sha2(col: "AbstractColOrName", numBits: int) -> Func: """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, and SHA-512). The numBits indicates the desired bit length of the result, which must have a value of 224, 256, 384, 512, or 0 (which is equivalent to 256). - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5679,15 +5129,9 @@ def sha2(col: "AbstractColOrName", numBits: int) -> Func: raise NotImplementedError - def hash(*cols: "AbstractColOrName") -> Func: """Calculates the hash code of given AbstractCols, and returns the result as an int AbstractCol. - - - - - Parameters ---------- cols : :class:`~osos.Col` or str @@ -5723,16 +5167,10 @@ def hash(*cols: "AbstractColOrName") -> Func: raise NotImplementedError - def xxhash64(*cols: "AbstractColOrName") -> Func: """Calculates the hash code of given AbstractCols using the 64-bit variant of the xxHash algorithm, and returns the result as a long AbstractCol. The hash computation uses an initial seed of 42. - - - - - Parameters ---------- cols : :class:`~osos.Col` or str @@ -5768,7 +5206,6 @@ def xxhash64(*cols: "AbstractColOrName") -> Func: raise NotImplementedError - def assert_true( col: "AbstractColOrName", errMsg: Optional[Union[AbstractCol, str]] = None ) -> Func: @@ -5776,11 +5213,6 @@ def assert_true( Returns `null` if the input AbstractCol is `true`; throws an exception with the provided error message otherwise. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5826,16 +5258,10 @@ def assert_true( raise NotImplementedError - def raise_error(errMsg: Union[AbstractCol, str]) -> Func: """ Throws an exception with the provided error message. - - - - - Parameters ---------- errMsg : :class:`~osos.Col` or str @@ -5870,16 +5296,10 @@ def raise_error(errMsg: Union[AbstractCol, str]) -> Func: # ---------------------- String/Binary functions ------------------------------ - def upper(col: "AbstractColOrName") -> Func: """ Converts a string expression to upper case. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5902,22 +5322,16 @@ def upper(col: "AbstractColOrName") -> Func: | PANDAS API| +------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(upper_func, col) + col = process_one_col(col) + return Func(upper_func, col) def lower(col: "AbstractColOrName") -> Func: """ Converts a string expression to lower case. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5940,22 +5354,16 @@ def lower(col: "AbstractColOrName") -> Func: | pandas api| +------------+ """ - if isinstance(col, str): - col = AbstractCol(col) - return Func(lower_func, col) + col = process_one_col(col) + return Func(lower_func, col) def ascii(col: "AbstractColOrName") -> Func: """ Computes the numeric value of the first character of the string AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -5981,16 +5389,10 @@ def ascii(col: "AbstractColOrName") -> Func: raise NotImplementedError - def base64(col: "AbstractColOrName") -> Func: """ Computes the BASE64 encoding of a binary AbstractCol and returns it as a string AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6016,16 +5418,10 @@ def base64(col: "AbstractColOrName") -> Func: raise NotImplementedError - def unbase64(col: "AbstractColOrName") -> Func: """ Decodes a BASE64 encoded string AbstractCol and returns it as a binary AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6053,16 +5449,10 @@ def unbase64(col: "AbstractColOrName") -> Func: raise NotImplementedError - def ltrim(col: "AbstractColOrName") -> Func: """ Trim the spaces from left end for the specified string value. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6088,16 +5478,10 @@ def ltrim(col: "AbstractColOrName") -> Func: raise NotImplementedError - def rtrim(col: "AbstractColOrName") -> Func: """ Trim the spaces from right end for the specified string value. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6123,16 +5507,10 @@ def rtrim(col: "AbstractColOrName") -> Func: raise NotImplementedError - def trim(col: "AbstractColOrName") -> Func: """ Trim the spaces from both ends for the specified string AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6158,16 +5536,11 @@ def trim(col: "AbstractColOrName") -> Func: raise NotImplementedError - def concat_ws(sep: str, *cols: "AbstractColOrName") -> Func: """ Concatenates multiple input string AbstractCols together into a single string AbstractCol, using the given separator. - - - - Parameters ---------- @@ -6190,16 +5563,11 @@ def concat_ws(sep: str, *cols: "AbstractColOrName") -> Func: raise NotImplementedError - def decode(col: "AbstractColOrName", charset: str) -> Func: """ Computes the first argument into a string from a binary using the provided character set (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - - - - Parameters ---------- @@ -6226,16 +5594,12 @@ def decode(col: "AbstractColOrName", charset: str) -> Func: raise NotImplementedError - def encode(col: "AbstractColOrName", charset: str) -> Func: """ Computes the first argument into a binary from a string using the provided character set (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - - - Parameters ---------- @@ -6262,17 +5626,11 @@ def encode(col: "AbstractColOrName", charset: str) -> Func: raise NotImplementedError - def format_number(col: "AbstractColOrName", d: int) -> Func: """ Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places with HALF_EVEN round mode, and returns the result as a string. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6291,15 +5649,10 @@ def format_number(col: "AbstractColOrName", d: int) -> Func: raise NotImplementedError - def format_string(format: str, *cols: "AbstractColOrName") -> Func: """ Formats the arguments in printf-style and returns the result as a string AbstractCol. - - - - Parameters ---------- @@ -6322,16 +5675,11 @@ def format_string(format: str, *cols: "AbstractColOrName") -> Func: raise NotImplementedError - def instr(str: "AbstractColOrName", substr: str) -> Func: """ Locate the position of the first occurrence of substr AbstractCol in the given string. Returns null if either of the arguments are null. - - - - Notes ----- @@ -6359,7 +5707,6 @@ def instr(str: "AbstractColOrName", substr: str) -> Func: raise NotImplementedError - def overlay( src: "AbstractColOrName", replace: "AbstractColOrName", @@ -6370,11 +5717,6 @@ def overlay( Overlay the specified portion of `src` with `replace`, starting from byte position `pos` of `src` and proceeding for `len` bytes. - - - - - Parameters ---------- src : :class:`~osos.Col` or str @@ -6419,7 +5761,6 @@ def overlay( raise NotImplementedError - def sentences( string: "AbstractColOrName", language: Optional["AbstractColOrName"] = None, @@ -6429,10 +5770,10 @@ def sentences( Splits a string into arrays of sentences, where each sentence is an array of words. The 'language' and 'country' arguments are optional, and if omitted, the default locale is used. - - - + + + Parameters ---------- @@ -6473,17 +5814,12 @@ def sentences( raise NotImplementedError - def substring(str: "AbstractColOrName", pos: int, len: int) -> Func: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` when str is Binary type. - - - - Notes ----- @@ -6512,7 +5848,6 @@ def substring(str: "AbstractColOrName", pos: int, len: int) -> Func: raise NotImplementedError - def substring_index(str: "AbstractColOrName", delim: str, count: int) -> Func: """ Returns the substring from string str before count occurrences of the delimiter delim. @@ -6520,10 +5855,6 @@ def substring_index(str: "AbstractColOrName", delim: str, count: int) -> Func: returned. If count is negative, every to the right of the final delimiter (counting from the right) is returned. substring_index performs a case-sensitive match when searching for delim. - - - - Parameters ---------- @@ -6550,15 +5881,9 @@ def substring_index(str: "AbstractColOrName", delim: str, count: int) -> Func: raise NotImplementedError - def levenshtein(left: "AbstractColOrName", right: "AbstractColOrName") -> Func: """Computes the Levenshtein distance of the two given strings. - - - - - Parameters ---------- left : :class:`~osos.Col` or str @@ -6580,16 +5905,10 @@ def levenshtein(left: "AbstractColOrName", right: "AbstractColOrName") -> Func: raise NotImplementedError - def locate(substr: str, str: "AbstractColOrName", pos: int = 1) -> Func: """ Locate the position of the first occurrence of substr in a string AbstractCol, after position pos. - - - - - Parameters ---------- substr : str @@ -6618,16 +5937,10 @@ def locate(substr: str, str: "AbstractColOrName", pos: int = 1) -> Func: raise NotImplementedError - def lpad(col: "AbstractColOrName", len: int, pad: str) -> Func: """ Left-pad the string AbstractCol to width `len` with `pad`. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6651,15 +5964,10 @@ def lpad(col: "AbstractColOrName", len: int, pad: str) -> Func: raise NotImplementedError - def rpad(col: "AbstractColOrName", len: int, pad: str) -> Func: """ Right-pad the string AbstractCol to width `len` with `pad`. - - - - Parameters ---------- @@ -6684,16 +5992,10 @@ def rpad(col: "AbstractColOrName", len: int, pad: str) -> Func: raise NotImplementedError - def repeat(col: "AbstractColOrName", n: int) -> Func: """ Repeats a string AbstractCol n times, and returns it as a new string AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6715,16 +6017,10 @@ def repeat(col: "AbstractColOrName", n: int) -> Func: raise NotImplementedError - def split(str: "AbstractColOrName", pattern: str, limit: int = -1) -> Func: """ Splits str around matches of the given pattern. - - - - - Parameters ---------- str : :class:`~osos.Col` or str @@ -6741,7 +6037,7 @@ def split(str: "AbstractColOrName", pattern: str, limit: int = -1) -> Func: * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting array can be of any size. - + `split` now takes an optional `limit` field. If not provided, default limit value is -1. Returns @@ -6760,16 +6056,10 @@ def split(str: "AbstractColOrName", pattern: str, limit: int = -1) -> Func: raise NotImplementedError - def regexp_extract(str: "AbstractColOrName", pattern: str, idx: int) -> Func: r"""Extract a specific group matched by a Java regex, from the specified string AbstractCol. If the regex did not match, or the specified group did not match, an empty string is returned. - - - - - Parameters ---------- str : :class:`~osos.Col` or str @@ -6799,7 +6089,6 @@ def regexp_extract(str: "AbstractColOrName", pattern: str, idx: int) -> Func: raise NotImplementedError - def regexp_replace( string: "AbstractColOrName", pattern: Union[str, AbstractCol], @@ -6807,11 +6096,6 @@ def regexp_replace( ) -> Func: r"""Replace all substrings of the specified string value that match regexp with replacement. - - - - - Parameters ---------- string : :class:`~osos.Col` or str @@ -6845,15 +6129,9 @@ def regexp_replace( raise NotImplementedError - def initcap(col: "AbstractColOrName") -> Func: """Translate the first letter of each word to upper case in the sentence. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6872,16 +6150,10 @@ def initcap(col: "AbstractColOrName") -> Func: raise NotImplementedError - def soundex(col: "AbstractColOrName") -> Func: """ Returns the SoundEx encoding for a string - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6901,15 +6173,9 @@ def soundex(col: "AbstractColOrName") -> Func: raise NotImplementedError - def bin(col: "AbstractColOrName") -> Func: """Returns the string representation of the binary value of the given AbstractCol. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6929,17 +6195,11 @@ def bin(col: "AbstractColOrName") -> Func: raise NotImplementedError - def hex(col: "AbstractColOrName") -> Func: """Computes hex value of the given AbstractCol, which could be :class:`pyspark.sql.types.StringType`, :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or :class:`pyspark.sql.types.LongType`. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6958,16 +6218,10 @@ def hex(col: "AbstractColOrName") -> Func: raise NotImplementedError - def unhex(col: "AbstractColOrName") -> Func: """Inverse of hex. Interprets each pair of characters as a hexadecimal number and converts to the byte representation of number. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -6986,17 +6240,11 @@ def unhex(col: "AbstractColOrName") -> Func: raise NotImplementedError - def length(col: "AbstractColOrName") -> Func: """Computes the character length of string data or number of bytes of binary data. The length of character data includes the trailing spaces. The length of binary data includes binary zeros. - - - - - Parameters ---------- col : :class:`~osos.Col` or str @@ -7015,7 +6263,6 @@ def length(col: "AbstractColOrName") -> Func: raise NotImplementedError - def octet_length(col: "AbstractColOrName") -> Func: """ Calculates the byte length for the specified string AbstractCol. @@ -7040,7 +6287,6 @@ def octet_length(col: "AbstractColOrName") -> Func: raise NotImplementedError - def bit_length(col: "AbstractColOrName") -> Func: """ Calculates the bit length for the specified string AbstractCol. @@ -7066,18 +6312,12 @@ def bit_length(col: "AbstractColOrName") -> Func: raise NotImplementedError - def translate(srcCol: "AbstractColOrName", matching: str, replace: str) -> Func: """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. Translation will happen whenever any character in the string is matching with the character in the `matching`. - - - - - Parameters ---------- srcCol : :class:`~osos.Col` or str diff --git a/osos/indexer.py b/osos/indexer.py index 6621d8f..cdbb93e 100644 --- a/osos/indexer.py +++ b/osos/indexer.py @@ -1,7 +1,7 @@ from pandas.api.indexers import BaseIndexer import pandas as pd import numpy as np -from typing import Union,Tuple +from typing import Union, Tuple from .window import currentRow, unboundedFollowing, unboundedPreceding diff --git a/osos/types.py b/osos/types.py index efff6d4..5c52136 100644 --- a/osos/types.py +++ b/osos/types.py @@ -4,7 +4,7 @@ import pandas as pd from decimal import Decimal -from typing import List,Tuple,Dict +from typing import List, Tuple, Dict # integral IntegerType = lambda: np.int32 @@ -39,8 +39,4 @@ ArrayType = lambda elementType, containsNull=False: List MapType = lambda keyType, elementType, valueContainsNull=False: Dict StructType = lambda fields: np.array -StructField = lambda name,dataType,nullable=False: Tuple - - - - +StructField = lambda name, dataType, nullable=False: Tuple diff --git a/osos/utils.py b/osos/utils.py index aa6316b..ce72d54 100644 --- a/osos/utils.py +++ b/osos/utils.py @@ -1,13 +1,13 @@ from __future__ import annotations import itertools -from typing import Iterable, Union,cast,List +from typing import Iterable, Union, cast, List from osos.exceptions import OsosTypeError import pandas as pd from numbers import Number from pandas.core.groupby.generic import SeriesGroupBy -from .column import Node, AbstractCol,AbstractLit +from .column import Node, AbstractCol, AbstractLit SeriesType = Union[pd.Series, SeriesGroupBy] @@ -37,48 +37,54 @@ def transform_or_noop(col): return list(itertools.chain.from_iterable(cols)) -def flatten_and_process_cols_or_lits(cols: Iterable[str|Node|Number]) -> Iterable[Node]: + +def flatten_and_process_cols_or_lits( + cols: Iterable[str | Node | Number], +) -> Iterable[Node]: flat_cols = flatten_cols(cols) - for i,col in enumerate(flat_cols): + for i, col in enumerate(flat_cols): if isinstance(col, str): flat_cols[i] = AbstractCol(col) elif isinstance(col, Number): flat_cols[i] = AbstractLit(col) - elif isinstance(col,Node): + elif isinstance(col, Node): pass else: raise OsosTypeError(f"Incorrect type: {type(col)}") - return cast(List[Node],flat_cols) + return cast(List[Node], flat_cols) + -def flatten_and_process_cols(cols: Iterable[str|Node]) -> Iterable[Node]: +def flatten_and_process_cols(cols: Iterable[str | Node]) -> Iterable[Node]: flat_cols = flatten_cols(cols) - for i,col in enumerate(flat_cols): + for i, col in enumerate(flat_cols): if isinstance(col, str): flat_cols[i] = AbstractCol(col) - elif isinstance(col,Node): + elif isinstance(col, Node): pass else: raise OsosTypeError(f"Incorrect type: {type(col)}") - return cast(List[Node],flat_cols) + return cast(List[Node], flat_cols) -def process_one_col_or_lit(col: str|Node|Number) -> Node: + +def process_one_col_or_lit(col: str | Node | Number) -> Node: if isinstance(col, str): col = AbstractCol(col) elif isinstance(col, Number): col = AbstractLit(col) - elif isinstance(col,Node): + elif isinstance(col, Node): pass else: raise OsosTypeError(f"Incorrect type: {type(col)}") return col -def process_one_col(col: str|Node) -> Node: + +def process_one_col(col: str | Node) -> Node: if isinstance(col, str): col = AbstractCol(col) - elif isinstance(col,Node): + elif isinstance(col, Node): pass else: raise OsosTypeError(f"Incorrect type: {type(col)}") diff --git a/osos/window.py b/osos/window.py index 8fe596f..db300b3 100644 --- a/osos/window.py +++ b/osos/window.py @@ -1,6 +1,6 @@ from .exceptions import AnalysisException from .utils import flatten_cols -from typing import Union,List +from typing import Union, List from .column import Node, AbstractCol, SimpleContainer, ColumnList, ForwardRef @@ -178,4 +178,4 @@ def __bool__(self): unboundedFollowing: int = _JAVA_MAX_LONG MAX_WINDOW_SIZE: int = unboundedFollowing - unboundedPreceding -currentRow: int = 0 \ No newline at end of file +currentRow: int = 0 diff --git a/setup.py b/setup.py index 427c8c9..b8a0bb7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,3 @@ from setuptools import setup, find_packages -setup( - packages=find_packages(), - python_requires=">=3.7" -) +setup(packages=find_packages(), python_requires=">=3.7") diff --git a/tests/auto_t_est.py b/tests/auto_t_est.py index ad79ab2..19fc1eb 100644 --- a/tests/auto_t_est.py +++ b/tests/auto_t_est.py @@ -4,711 +4,800 @@ column = col -#osos.functions.col: -col('x') -column('x') +# osos.functions.col: +col("x") +column("x") -#osos.functions.column: -col('x') -column('x') +# osos.functions.column: +col("x") +column("x") -#osos.functions.lit: +# osos.functions.lit: df = OsosSession.range(1) -df.select(lit(5).alias('height'), df.id).show() +df.select(lit(5).alias("height"), df.id).show() OsosSession.range(1).select(lit([1, 2, 3])).show() -#osos.functions.broadcast: +# osos.functions.broadcast: from osos import types + df = OsosSession.createDataFrame([1, 2, 3, 3, 4], types.IntegerType()) df_small = OsosSession.range(3) df_b = broadcast(df_small) df.join(df_b, df.value == df_small.id).show() -#osos.functions.coalesce: +# osos.functions.coalesce: cDf = OsosSession.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) cDf.show() cDf.select(coalesce(cDf["a"], cDf["b"])).show() -cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() +cDf.select("*", coalesce(cDf["a"], lit(0.0))).show() -#osos.functions.input_file_name: +# osos.functions.input_file_name: import os + path = os.path.abspath(__file__) df = OsosSession.read.text(path) df.select(input_file_name()).first() -#osos.functions.isnan: -df = OsosSession.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) +# osos.functions.isnan: +df = OsosSession.createDataFrame([(1.0, float("nan")), (float("nan"), 2.0)], ("a", "b")) df.select("a", "b", isnan("a").alias("r1"), isnan(df.b).alias("r2")).show() -#osos.functions.isnull: +# osos.functions.isnull: df = OsosSession.createDataFrame([(1, None), (None, 2)], ("a", "b")) df.select("a", "b", isnull("a").alias("r1"), isnull(df.b).alias("r2")).show() -#osos.functions.monotonically_increasing_id: -df0 = DataFrame(pd.DataFrame({"a":[1,2,3]})) -df0.select(monotonically_increasing_id().alias('id')).collect() +# osos.functions.monotonically_increasing_id: +df0 = DataFrame(pd.DataFrame({"a": [1, 2, 3]})) +df0.select(monotonically_increasing_id().alias("id")).collect() -#osos.functions.nanvl: -df = OsosSession.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) +# osos.functions.nanvl: +df = OsosSession.createDataFrame([(1.0, float("nan")), (float("nan"), 2.0)], ("a", "b")) df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect() -#osos.functions.rand: +# osos.functions.rand: df = OsosSession.range(2) -df.withColumn('rand', rand(seed=42) * 3).show() +df.withColumn("rand", rand(seed=42) * 3).show() -#osos.functions.randn: +# osos.functions.randn: df = OsosSession.range(2) -df.withColumn('randn', randn(seed=42)).show() +df.withColumn("randn", randn(seed=42)).show() -#osos.functions.spark_partition_id: +# osos.functions.spark_partition_id: df = OsosSession.range(2) df.repartition(1).select(spark_partition_id().alias("pid")).collect() -#osos.functions.when: +# osos.functions.when: df = OsosSession.range(3) -df.select(when(df['id'] == 2, 3).otherwise(4).alias("age")).show() +df.select(when(df["id"] == 2, 3).otherwise(4).alias("age")).show() df.select(when(df.id == 2, df.id + 1).alias("age")).show() -#osos.functions.bitwise_not: +# osos.functions.bitwise_not: df = OsosSession.range(1) df.select(bitwise_not(lit(0))).show() df.select(bitwise_not(lit(1))).show() -#osos.functions.bitwiseNOT: -#osos.functions.expr: +# osos.functions.bitwiseNOT: +# osos.functions.expr: df = OsosSession.createDataFrame([["Alice"], ["Bob"]], ["name"]) df.select("name", expr("length(name)")).show() -#osos.functions.greatest: -df = OsosSession.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) +# osos.functions.greatest: +df = OsosSession.createDataFrame([(1, 4, 3)], ["a", "b", "c"]) df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect() -#osos.functions.least: -df = OsosSession.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) +# osos.functions.least: +df = OsosSession.createDataFrame([(1, 4, 3)], ["a", "b", "c"]) df.select(least(df.a, df.b, df.c).alias("least")).collect() -#osos.functions.sqrt: +# osos.functions.sqrt: df = OsosSession.range(1) df.select(sqrt(lit(4))).show() -#osos.functions.abs: +# osos.functions.abs: df = OsosSession.range(1) df.select(abs(lit(-1))).show() -#osos.functions.acos: +# osos.functions.acos: df = OsosSession.range(1, 3) df.select(acos(df.id)).show() -#osos.functions.acosh: +# osos.functions.acosh: df = OsosSession.range(2) df.select(acosh(col("id"))).show() -#osos.functions.asin: +# osos.functions.asin: df = OsosSession.createDataFrame([(0,), (2,)]) df.select(asin(df.schema.fieldNames()[0])).show() -#osos.functions.asinh: +# osos.functions.asinh: df = OsosSession.range(1) df.select(asinh(col("id"))).show() -#osos.functions.atan: +# osos.functions.atan: df = OsosSession.range(1) df.select(atan(df.id)).show() -#osos.functions.atanh: +# osos.functions.atanh: df = OsosSession.createDataFrame([(0,), (2,)], schema=["numbers"]) df.select(atanh(df["numbers"])).show() -#osos.functions.atan2: +# osos.functions.atan2: df = OsosSession.range(1) df.select(atan2(lit(1), lit(2))).first() -#osos.functions.bin: -df = OsosSession.createDataFrame([2,5], "INT") -df.select(bin(df.value).alias('c')).collect() +# osos.functions.bin: +df = OsosSession.createDataFrame([2, 5], "INT") +df.select(bin(df.value).alias("c")).collect() -#osos.functions.cbrt: +# osos.functions.cbrt: df = OsosSession.range(1) df.select(cbrt(lit(27))).show() -#osos.functions.ceil: +# osos.functions.ceil: df = OsosSession.range(1) df.select(ceil(lit(-0.1))).show() -#osos.functions.conv: -df = OsosSession.createDataFrame([("010101",)], ['n']) -df.select(conv(df.n, 2, 16).alias('hex')).collect() +# osos.functions.conv: +df = OsosSession.createDataFrame([("010101",)], ["n"]) +df.select(conv(df.n, 2, 16).alias("hex")).collect() -#osos.functions.cos: +# osos.functions.cos: import math + df = OsosSession.range(1) df.select(cos(lit(math.pi))).first() -#osos.functions.cosh: +# osos.functions.cosh: df = OsosSession.range(1) df.select(cosh(lit(1))).first() -#osos.functions.cot: +# osos.functions.cot: import math + df = OsosSession.range(1) df.select(cot(lit(math.radians(45)))).first() -#osos.functions.csc: +# osos.functions.csc: import math + df = OsosSession.range(1) df.select(csc(lit(math.radians(90)))).first() -#osos.functions.exp: +# osos.functions.exp: df = OsosSession.range(1) df.select(exp(lit(0))).show() -#osos.functions.expm1: +# osos.functions.expm1: df = OsosSession.range(1) df.select(expm1(lit(1))).first() -#osos.functions.factorial: -df = OsosSession.createDataFrame([(5,)], ['n']) -df.select(factorial(df.n).alias('f')).collect() +# osos.functions.factorial: +df = OsosSession.createDataFrame([(5,)], ["n"]) +df.select(factorial(df.n).alias("f")).collect() -#osos.functions.floor: +# osos.functions.floor: df = OsosSession.range(1) df.select(floor(lit(2.5))).show() -#osos.functions.hex: -OsosSession.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() +# osos.functions.hex: +OsosSession.createDataFrame([("ABC", 3)], ["a", "b"]).select( + hex("a"), hex("b") +).collect() -#osos.functions.unhex: -OsosSession.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() +# osos.functions.unhex: +OsosSession.createDataFrame([("414243",)], ["a"]).select(unhex("a")).collect() -#osos.functions.hypot: +# osos.functions.hypot: df = OsosSession.range(1) df.select(hypot(lit(1), lit(2))).first() -#osos.functions.log: +# osos.functions.log: df = OsosSession.createDataFrame([10, 100, 1000], "INT") -df.select(log(10.0, df.value).alias('ten')).show() +df.select(log(10.0, df.value).alias("ten")).show() df.select(log(df.value)).show() -#osos.functions.log10: +# osos.functions.log10: df = OsosSession.range(1) df.select(log10(lit(100))).show() -#osos.functions.log1p: +# osos.functions.log1p: import math + df = OsosSession.range(1) df.select(log1p(lit(math.e))).first() -df.select(log(lit(math.e+1))).first() +df.select(log(lit(math.e + 1))).first() -#osos.functions.log2: -df = OsosSession.createDataFrame([(4,)], ['a']) -df.select(log2('a').alias('log2')).show() +# osos.functions.log2: +df = OsosSession.createDataFrame([(4,)], ["a"]) +df.select(log2("a").alias("log2")).show() -#osos.functions.pmod: -df = OsosSession.createDataFrame([ -(1.0, float('nan')), (float('nan'), 2.0), (10.0, 3.0), -(float('nan'), float('nan')), (-3.0, 4.0), (-10.0, 3.0), -(-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)], -("a", "b")) +# osos.functions.pmod: +df = OsosSession.createDataFrame( + [ + (1.0, float("nan")), + (float("nan"), 2.0), + (10.0, 3.0), + (float("nan"), float("nan")), + (-3.0, 4.0), + (-10.0, 3.0), + (-5.0, -6.0), + (7.0, -8.0), + (1.0, 2.0), + ], + ("a", "b"), +) df.select(pmod("a", "b")).show() -#osos.functions.pow: +# osos.functions.pow: df = OsosSession.range(1) df.select(pow(lit(3), lit(2))).first() -#osos.functions.rint: +# osos.functions.rint: df = OsosSession.range(1) df.select(rint(lit(10.6))).show() df.select(rint(lit(10.3))).show() -#osos.functions.round: -OsosSession.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect() +# osos.functions.round: +OsosSession.createDataFrame([(2.5,)], ["a"]).select(round("a", 0).alias("r")).collect() -#osos.functions.bround: -OsosSession.createDataFrame([(2.5,)], ['a']).select(bround('a', 0).alias('r')).collect() +# osos.functions.bround: +OsosSession.createDataFrame([(2.5,)], ["a"]).select(bround("a", 0).alias("r")).collect() -#osos.functions.sec: +# osos.functions.sec: df = OsosSession.range(1) df.select(sec(lit(1.5))).first() -#osos.functions.shiftleft: -OsosSession.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect() +# osos.functions.shiftleft: +OsosSession.createDataFrame([(21,)], ["a"]).select( + shiftleft("a", 1).alias("r") +).collect() -#osos.functions.shiftright: -OsosSession.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect() +# osos.functions.shiftright: +OsosSession.createDataFrame([(42,)], ["a"]).select( + shiftright("a", 1).alias("r") +).collect() -#osos.functions.shiftrightunsigned: -df = OsosSession.createDataFrame([(-42,)], ['a']) -df.select(shiftrightunsigned('a', 1).alias('r')).collect() +# osos.functions.shiftrightunsigned: +df = OsosSession.createDataFrame([(-42,)], ["a"]) +df.select(shiftrightunsigned("a", 1).alias("r")).collect() -#osos.functions.signum: +# osos.functions.signum: df = OsosSession.range(1) df.select(signum(lit(-5))).show() df.select(signum(lit(6))).show() -#osos.functions.sin: +# osos.functions.sin: import math + df = OsosSession.range(1) df.select(sin(lit(math.radians(90)))).first() -#osos.functions.sinh: +# osos.functions.sinh: df = OsosSession.range(1) df.select(sinh(lit(1.1))).first() -#osos.functions.tan: +# osos.functions.tan: import math + df = OsosSession.range(1) df.select(tan(lit(math.radians(45)))).first() -#osos.functions.tanh: +# osos.functions.tanh: import math + df = OsosSession.range(1) df.select(tanh(lit(math.radians(90)))).first() -#osos.functions.toDegrees: -#osos.functions.degrees: +# osos.functions.toDegrees: +# osos.functions.degrees: import math + df = OsosSession.range(1) df.select(degrees(lit(math.pi))).first() -#osos.functions.toRadians: -#osos.functions.radians: +# osos.functions.toRadians: +# osos.functions.radians: df = OsosSession.range(1) df.select(radians(lit(180))).first() -#osos.functions.add_months: -df = OsosSession.createDataFrame([('2015-04-08', 2)], ['dt', 'add']) -df.select(add_months(df.dt, 1).alias('next_month')).collect() -df.select(add_months(df.dt, df.add.cast('integer')).alias('next_month')).collect() -df.select(add_months('dt', -2).alias('prev_month')).collect() +# osos.functions.add_months: +df = OsosSession.createDataFrame([("2015-04-08", 2)], ["dt", "add"]) +df.select(add_months(df.dt, 1).alias("next_month")).collect() +df.select(add_months(df.dt, df.add.cast("integer")).alias("next_month")).collect() +df.select(add_months("dt", -2).alias("prev_month")).collect() -#osos.functions.current_date: +# osos.functions.current_date: df = OsosSession.range(1) df.select(current_date()).show() -#osos.functions.current_timestamp: +# osos.functions.current_timestamp: df = OsosSession.range(1) df.select(current_timestamp()).show(truncate=False) -#osos.functions.date_add: -df = OsosSession.createDataFrame([('2015-04-08', 2,)], ['dt', 'add']) -df.select(date_add(df.dt, 1).alias('next_date')).collect() -df.select(date_add(df.dt, df.add.cast('integer')).alias('next_date')).collect() -df.select(date_add('dt', -1).alias('prev_date')).collect() +# osos.functions.date_add: +df = OsosSession.createDataFrame( + [ + ( + "2015-04-08", + 2, + ) + ], + ["dt", "add"], +) +df.select(date_add(df.dt, 1).alias("next_date")).collect() +df.select(date_add(df.dt, df.add.cast("integer")).alias("next_date")).collect() +df.select(date_add("dt", -1).alias("prev_date")).collect() -#osos.functions.date_format: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect() +# osos.functions.date_format: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(date_format("dt", "MM/dd/yyy").alias("date")).collect() -#osos.functions.date_sub: -df = OsosSession.createDataFrame([('2015-04-08', 2,)], ['dt', 'sub']) -df.select(date_sub(df.dt, 1).alias('prev_date')).collect() -df.select(date_sub(df.dt, df.sub.cast('integer')).alias('prev_date')).collect() -df.select(date_sub('dt', -1).alias('next_date')).collect() +# osos.functions.date_sub: +df = OsosSession.createDataFrame( + [ + ( + "2015-04-08", + 2, + ) + ], + ["dt", "sub"], +) +df.select(date_sub(df.dt, 1).alias("prev_date")).collect() +df.select(date_sub(df.dt, df.sub.cast("integer")).alias("prev_date")).collect() +df.select(date_sub("dt", -1).alias("next_date")).collect() -#osos.functions.date_trunc: -df = OsosSession.createDataFrame([('1997-02-28 05:02:11',)], ['t']) -df.select(date_trunc('year', df.t).alias('year')).collect() -df.select(date_trunc('mon', df.t).alias('month')).collect() +# osos.functions.date_trunc: +df = OsosSession.createDataFrame([("1997-02-28 05:02:11",)], ["t"]) +df.select(date_trunc("year", df.t).alias("year")).collect() +df.select(date_trunc("mon", df.t).alias("month")).collect() -#osos.functions.datediff: -df = OsosSession.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2']) -df.select(datediff(df.d2, df.d1).alias('diff')).collect() +# osos.functions.datediff: +df = OsosSession.createDataFrame([("2015-04-08", "2015-05-10")], ["d1", "d2"]) +df.select(datediff(df.d2, df.d1).alias("diff")).collect() -#osos.functions.dayofmonth: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(dayofmonth('dt').alias('day')).collect() +# osos.functions.dayofmonth: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(dayofmonth("dt").alias("day")).collect() -#osos.functions.dayofweek: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(dayofweek('dt').alias('day')).collect() +# osos.functions.dayofweek: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(dayofweek("dt").alias("day")).collect() -#osos.functions.dayofyear: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(dayofyear('dt').alias('day')).collect() +# osos.functions.dayofyear: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(dayofyear("dt").alias("day")).collect() -#osos.functions.second: +# osos.functions.second: import datetime -df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) -df.select(second('ts').alias('second')).collect() -#osos.functions.weekofyear: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(weekofyear(df.dt).alias('week')).collect() +df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ["ts"]) +df.select(second("ts").alias("second")).collect() + +# osos.functions.weekofyear: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(weekofyear(df.dt).alias("week")).collect() -#osos.functions.year: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(year('dt').alias('year')).collect() +# osos.functions.year: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(year("dt").alias("year")).collect() -#osos.functions.quarter: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(quarter('dt').alias('quarter')).collect() +# osos.functions.quarter: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(quarter("dt").alias("quarter")).collect() -#osos.functions.month: -df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -df.select(month('dt').alias('month')).collect() +# osos.functions.month: +df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +df.select(month("dt").alias("month")).collect() -#osos.functions.last_day: -df = OsosSession.createDataFrame([('1997-02-10',)], ['d']) -df.select(last_day(df.d).alias('date')).collect() +# osos.functions.last_day: +df = OsosSession.createDataFrame([("1997-02-10",)], ["d"]) +df.select(last_day(df.d).alias("date")).collect() -#osos.functions.localtimestamp: +# osos.functions.localtimestamp: df = OsosSession.range(1) df.select(localtimestamp()).show(truncate=False) -#osos.functions.minute: +# osos.functions.minute: import datetime -df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) -df.select(minute('ts').alias('minute')).collect() -#osos.functions.months_between: -df = OsosSession.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2']) -df.select(months_between(df.date1, df.date2).alias('months')).collect() -df.select(months_between(df.date1, df.date2, False).alias('months')).collect() +df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ["ts"]) +df.select(minute("ts").alias("minute")).collect() -#osos.functions.next_day: -df = OsosSession.createDataFrame([('2015-07-27',)], ['d']) -df.select(next_day(df.d, 'Sun').alias('date')).collect() +# osos.functions.months_between: +df = OsosSession.createDataFrame( + [("1997-02-28 10:30:00", "1996-10-30")], ["date1", "date2"] +) +df.select(months_between(df.date1, df.date2).alias("months")).collect() +df.select(months_between(df.date1, df.date2, False).alias("months")).collect() + +# osos.functions.next_day: +df = OsosSession.createDataFrame([("2015-07-27",)], ["d"]) +df.select(next_day(df.d, "Sun").alias("date")).collect() -#osos.functions.hour: +# osos.functions.hour: import datetime -df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) -df.select(hour('ts').alias('hour')).collect() -#osos.functions.make_date: -df = OsosSession.createDataFrame([(2020, 6, 26)], ['Y', 'M', 'D']) +df = OsosSession.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ["ts"]) +df.select(hour("ts").alias("hour")).collect() + +# osos.functions.make_date: +df = OsosSession.createDataFrame([(2020, 6, 26)], ["Y", "M", "D"]) df.select(make_date(df.Y, df.M, df.D).alias("datefield")).collect() -#osos.functions.from_unixtime: +# osos.functions.from_unixtime: OsosSession.conf.set("OsosSession.sql.session.timeZone", "America/Los_Angeles") -time_df = OsosSession.createDataFrame([(1428476400,)], ['unix_time']) -time_df.select(from_unixtime('unix_time').alias('ts')).collect() +time_df = OsosSession.createDataFrame([(1428476400,)], ["unix_time"]) +time_df.select(from_unixtime("unix_time").alias("ts")).collect() OsosSession.conf.unset("OsosSession.sql.session.timeZone") -#osos.functions.unix_timestamp: +# osos.functions.unix_timestamp: OsosSession.conf.set("OsosSession.sql.session.timeZone", "America/Los_Angeles") -time_df = OsosSession.createDataFrame([('2015-04-08',)], ['dt']) -time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect() +time_df = OsosSession.createDataFrame([("2015-04-08",)], ["dt"]) +time_df.select(unix_timestamp("dt", "yyyy-MM-dd").alias("unix_time")).collect() OsosSession.conf.unset("OsosSession.sql.session.timeZone") -#osos.functions.to_timestamp: -df = OsosSession.createDataFrame([('1997-02-28 10:30:00',)], ['t']) -df.select(to_timestamp(df.t).alias('dt')).collect() +# osos.functions.to_timestamp: +df = OsosSession.createDataFrame([("1997-02-28 10:30:00",)], ["t"]) +df.select(to_timestamp(df.t).alias("dt")).collect() -df = OsosSession.createDataFrame([('1997-02-28 10:30:00',)], ['t']) -df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect() +df = OsosSession.createDataFrame([("1997-02-28 10:30:00",)], ["t"]) +df.select(to_timestamp(df.t, "yyyy-MM-dd HH:mm:ss").alias("dt")).collect() -#osos.functions.to_date: -df = OsosSession.createDataFrame([('1997-02-28 10:30:00',)], ['t']) -df.select(to_date(df.t).alias('date')).collect() +# osos.functions.to_date: +df = OsosSession.createDataFrame([("1997-02-28 10:30:00",)], ["t"]) +df.select(to_date(df.t).alias("date")).collect() -df = OsosSession.createDataFrame([('1997-02-28 10:30:00',)], ['t']) -df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect() +df = OsosSession.createDataFrame([("1997-02-28 10:30:00",)], ["t"]) +df.select(to_date(df.t, "yyyy-MM-dd HH:mm:ss").alias("date")).collect() -#osos.functions.trunc: -df = OsosSession.createDataFrame([('1997-02-28',)], ['d']) -df.select(trunc(df.d, 'year').alias('year')).collect() -df.select(trunc(df.d, 'mon').alias('month')).collect() +# osos.functions.trunc: +df = OsosSession.createDataFrame([("1997-02-28",)], ["d"]) +df.select(trunc(df.d, "year").alias("year")).collect() +df.select(trunc(df.d, "mon").alias("month")).collect() -#osos.functions.from_utc_timestamp: -df = OsosSession.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) -df.select(from_utc_timestamp(df.ts, "PST").alias('local_time')).collect() -df.select(from_utc_timestamp(df.ts, df.tz).alias('local_time')).collect() +# osos.functions.from_utc_timestamp: +df = OsosSession.createDataFrame([("1997-02-28 10:30:00", "JST")], ["ts", "tz"]) +df.select(from_utc_timestamp(df.ts, "PST").alias("local_time")).collect() +df.select(from_utc_timestamp(df.ts, df.tz).alias("local_time")).collect() -#osos.functions.to_utc_timestamp: -df = OsosSession.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) -df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect() -df.select(to_utc_timestamp(df.ts, df.tz).alias('utc_time')).collect() +# osos.functions.to_utc_timestamp: +df = OsosSession.createDataFrame([("1997-02-28 10:30:00", "JST")], ["ts", "tz"]) +df.select(to_utc_timestamp(df.ts, "PST").alias("utc_time")).collect() +df.select(to_utc_timestamp(df.ts, df.tz).alias("utc_time")).collect() -#osos.functions.window: +# osos.functions.window: import datetime + df = OsosSession.createDataFrame( -[(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], + [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ).toDF("date", "val") w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) -w.select(w.window.start.cast("string").alias("start"), -w.window.end.cast("string").alias("end"), "sum").collect() +w.select( + w.window.start.cast("string").alias("start"), + w.window.end.cast("string").alias("end"), + "sum", +).collect() -#osos.functions.session_window: +# osos.functions.session_window: df = OsosSession.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val") w = df.groupBy(session_window("date", "5 seconds")).agg(sum("val").alias("sum")) -w.select(w.session_window.start.cast("string").alias("start"), -w.session_window.end.cast("string").alias("end"), "sum").collect() +w.select( + w.session_window.start.cast("string").alias("start"), + w.session_window.end.cast("string").alias("end"), + "sum", +).collect() w = df.groupBy(session_window("date", lit("5 seconds"))).agg(sum("val").alias("sum")) -w.select(w.session_window.start.cast("string").alias("start"), -w.session_window.end.cast("string").alias("end"), "sum").collect() +w.select( + w.session_window.start.cast("string").alias("start"), + w.session_window.end.cast("string").alias("end"), + "sum", +).collect() -#osos.functions.timestamp_seconds: +# osos.functions.timestamp_seconds: from osos.functions import timestamp_seconds + OsosSession.conf.set("OsosSession.sql.session.timeZone", "UTC") -time_df = OsosSession.createDataFrame([(1230219000,)], ['unix_time']) -time_df.select(timestamp_seconds(time_df.unix_time).alias('ts')).show() -time_df.select(timestamp_seconds('unix_time').alias('ts')).printSchema() +time_df = OsosSession.createDataFrame([(1230219000,)], ["unix_time"]) +time_df.select(timestamp_seconds(time_df.unix_time).alias("ts")).show() +time_df.select(timestamp_seconds("unix_time").alias("ts")).printSchema() OsosSession.conf.unset("OsosSession.sql.session.timeZone") -#osos.functions.window_time: +# osos.functions.window_time: import datetime + df = OsosSession.createDataFrame( -[(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], + [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ).toDF("date", "val") w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) w.select( -w.window.end.cast("string").alias("end"), -window_time(w.window).cast("string").alias("window_time"), -"sum" + w.window.end.cast("string").alias("end"), + window_time(w.window).cast("string").alias("window_time"), + "sum", ).collect() -#osos.functions.array: +# osos.functions.array: df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) -df.select(array('age', 'age').alias("arr")).collect() +df.select(array("age", "age").alias("arr")).collect() df.select(array([df.age, df.age]).alias("arr")).collect() -df.select(array('age', 'age').alias("col")).printSchema() +df.select(array("age", "age").alias("col")).printSchema() -#osos.functions.array_contains: -df = OsosSession.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) +# osos.functions.array_contains: +df = OsosSession.createDataFrame([(["a", "b", "c"],), ([],)], ["data"]) df.select(array_contains(df.data, "a")).collect() df.select(array_contains(df.data, lit("a"))).collect() -#osos.functions.arrays_overlap: -df = OsosSession.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y']) +# osos.functions.arrays_overlap: +df = OsosSession.createDataFrame( + [(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ["x", "y"] +) df.select(arrays_overlap(df.x, df.y).alias("overlap")).collect() -#osos.functions.array_join: -df = OsosSession.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data']) +# osos.functions.array_join: +df = OsosSession.createDataFrame([(["a", "b", "c"],), (["a", None],)], ["data"]) df.select(array_join(df.data, ",").alias("joined")).collect() df.select(array_join(df.data, ",", "NULL").alias("joined")).collect() -#osos.functions.create_map: +# osos.functions.create_map: df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) -df.select(create_map('name', 'age').alias("map")).collect() +df.select(create_map("name", "age").alias("map")).collect() df.select(create_map([df.name, df.age]).alias("map")).collect() -#osos.functions.slice: -df = OsosSession.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) +# osos.functions.slice: +df = OsosSession.createDataFrame([([1, 2, 3],), ([4, 5],)], ["x"]) df.select(slice(df.x, 2, 2).alias("sliced")).collect() -#osos.functions.concat: -df = OsosSession.createDataFrame([('abcd','123')], ['s', 'd']) -df = df.select(concat(df.s, df.d).alias('s')) +# osos.functions.concat: +df = OsosSession.createDataFrame([("abcd", "123")], ["s", "d"]) +df = df.select(concat(df.s, df.d).alias("s")) df.collect() df -df = OsosSession.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) +df = OsosSession.createDataFrame( + [([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ["a", "b", "c"] +) df = df.select(concat(df.a, df.b, df.c).alias("arr")) df.collect() df -#osos.functions.array_position: -df = OsosSession.createDataFrame([(["c", "b", "a"],), ([],)], ['data']) +# osos.functions.array_position: +df = OsosSession.createDataFrame([(["c", "b", "a"],), ([],)], ["data"]) df.select(array_position(df.data, "a")).collect() -#osos.functions.element_at: -df = OsosSession.createDataFrame([(["a", "b", "c"],)], ['data']) +# osos.functions.element_at: +df = OsosSession.createDataFrame([(["a", "b", "c"],)], ["data"]) df.select(element_at(df.data, 1)).collect() df.select(element_at(df.data, -1)).collect() -df = OsosSession.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) +df = OsosSession.createDataFrame([({"a": 1.0, "b": 2.0},)], ["data"]) df.select(element_at(df.data, lit("a"))).collect() -#osos.functions.array_append: +# osos.functions.array_append: from osos import Row + df = OsosSession.createDataFrame([Row(c1=["b", "a", "c"], c2="c")]) df.select(array_append(df.c1, df.c2)).collect() -df.select(array_append(df.c1, 'x')).collect() - -#osos.functions.array_sort: -df = OsosSession.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data']) -df.select(array_sort(df.data).alias('r')).collect() -df = OsosSession.createDataFrame([(["foo", "foobar", None, "bar"],),(["foo"],),([],)], ['data']) -df.select(array_sort( -"data", -lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise(length(y) - length(x)) -).alias("r")).collect() - -#osos.functions.array_insert: +df.select(array_append(df.c1, "x")).collect() + +# osos.functions.array_sort: +df = OsosSession.createDataFrame([([2, 1, None, 3],), ([1],), ([],)], ["data"]) +df.select(array_sort(df.data).alias("r")).collect() df = OsosSession.createDataFrame( -[(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')], -['data', 'pos', 'val'] + [(["foo", "foobar", None, "bar"],), (["foo"],), ([],)], ["data"] ) -df.select(array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect() -df.select(array_insert(df.data, 5, 'hello').alias('data')).collect() +df.select( + array_sort( + "data", + lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise( + length(y) - length(x) + ), + ).alias("r") +).collect() -#osos.functions.array_remove: -df = OsosSession.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data']) +# osos.functions.array_insert: +df = OsosSession.createDataFrame( + [(["a", "b", "c"], 2, "d"), (["c", "b", "a"], -2, "d")], ["data", "pos", "val"] +) +df.select(array_insert(df.data, df.pos.cast("integer"), df.val).alias("data")).collect() +df.select(array_insert(df.data, 5, "hello").alias("data")).collect() + +# osos.functions.array_remove: +df = OsosSession.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ["data"]) df.select(array_remove(df.data, 1)).collect() -#osos.functions.array_distinct: -df = OsosSession.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data']) +# osos.functions.array_distinct: +df = OsosSession.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ["data"]) df.select(array_distinct(df.data)).collect() -#osos.functions.array_intersect: +# osos.functions.array_intersect: from osos import Row + df = OsosSession.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) df.select(array_intersect(df.c1, df.c2)).collect() -#osos.functions.array_union: +# osos.functions.array_union: from osos import Row + df = OsosSession.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) df.select(array_union(df.c1, df.c2)).collect() -#osos.functions.array_except: +# osos.functions.array_except: from osos import Row + df = OsosSession.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) df.select(array_except(df.c1, df.c2)).collect() -#osos.functions.array_compact: -df = OsosSession.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ['data']) +# osos.functions.array_compact: +df = OsosSession.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ["data"]) df.select(array_compact(df.data)).collect() -#osos.functions.transform: +# osos.functions.transform: df = OsosSession.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values")) df.select(transform("values", lambda x: x * 2).alias("doubled")).show() + def alternate(x, i): - return when(i % 2 == 0, x).otherwise(-x) + return when(i % 2 == 0, x).otherwise(-x) + df.select(transform("values", alternate).alias("alternated")).show() -#osos.functions.exists: -df = OsosSession.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],("key", "values")) +# osos.functions.exists: +df = OsosSession.createDataFrame( + [(1, [1, 2, 3, 4]), (2, [3, -1, 0])], ("key", "values") +) df.select(exists("values", lambda x: x < 0).alias("any_negative")).show() -#osos.functions.forall: +# osos.functions.forall: df = OsosSession.createDataFrame( -[(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])], -("key", "values") + [(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])], ("key", "values") ) df.select(forall("values", lambda x: x.rlike("foo")).alias("all_foo")).show() -#osos.functions.filter: +# osos.functions.filter: df = OsosSession.createDataFrame( -[(1, ["2018-09-20", "2019-02-03", "2019-07-01", "2020-06-01"])], -("key", "values") + [(1, ["2018-09-20", "2019-02-03", "2019-07-01", "2020-06-01"])], ("key", "values") ) + + def after_second_quarter(x): - return month(to_date(x)) > 6 + return month(to_date(x)) > 6 -df.select( -filter("values", after_second_quarter).alias("after_second_quarter") -).show(truncate=False) -#osos.functions.aggregate: +df.select(filter("values", after_second_quarter).alias("after_second_quarter")).show( + truncate=False +) + +# osos.functions.aggregate: df = OsosSession.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values")) df.select(aggregate("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show() + def merge(acc, x): count = acc.count + 1 sum = acc.sum + x return struct(count.alias("count"), sum.alias("sum")) - + + df.select( -aggregate( -"values", -struct(lit(0).alias("count"), lit(0.0).alias("sum")), -merge, -lambda acc: acc.sum / acc.count, -).alias("mean") + aggregate( + "values", + struct(lit(0).alias("count"), lit(0.0).alias("sum")), + merge, + lambda acc: acc.sum / acc.count, + ).alias("mean") ).show() -#osos.functions.zip_with: +# osos.functions.zip_with: df = OsosSession.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], ("id", "xs", "ys")) -df.select(zip_with("xs", "ys", lambda x, y: x ** y).alias("powers")).show(truncate=False) +df.select(zip_with("xs", "ys", lambda x, y: x**y).alias("powers")).show( + truncate=False +) df = OsosSession.createDataFrame([(1, ["foo", "bar"], [1, 2, 3])], ("id", "xs", "ys")) df.select(zip_with("xs", "ys", lambda x, y: concat_ws("_", x, y)).alias("xs_ys")).show() -#osos.functions.transform_keys: +# osos.functions.transform_keys: df = OsosSession.createDataFrame([(1, {"foo": -2.0, "bar": 2.0})], ("id", "data")) -row = df.select(transform_keys( -"data", lambda k, _: upper(k)).alias("data_upper") +row = df.select( + transform_keys("data", lambda k, _: upper(k)).alias("data_upper") ).head() sorted(row["data_upper"].items()) -#osos.functions.transform_values: -df = OsosSession.createDataFrame([(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data")) -row = df.select(transform_values( -"data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v) -).alias("new_data")).head() +# osos.functions.transform_values: +df = OsosSession.createDataFrame( + [(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data") +) +row = df.select( + transform_values( + "data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v) + ).alias("new_data") +).head() sorted(row["new_data"].items()) -#osos.functions.map_filter: -df = OsosSession.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data")) -row = df.select(map_filter( -"data", lambda _, v: v > 30.0).alias("data_filtered") -).head() +# osos.functions.map_filter: +df = OsosSession.createDataFrame( + [(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data") +) +row = df.select(map_filter("data", lambda _, v: v > 30.0).alias("data_filtered")).head() sorted(row["data_filtered"].items()) -#osos.functions.map_from_arrays: -df = OsosSession.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v']) +# osos.functions.map_from_arrays: +df = OsosSession.createDataFrame([([2, 5], ["a", "b"])], ["k", "v"]) df = df.select(map_from_arrays(df.k, df.v).alias("col")) df.show() df.printSchema() -#osos.functions.map_zip_with: -df = OsosSession.createDataFrame([ -(1, {"IT": 24.0, "SALES": 12.00}, {"IT": 2.0, "SALES": 1.4})], -("id", "base", "ratio") +# osos.functions.map_zip_with: +df = OsosSession.createDataFrame( + [(1, {"IT": 24.0, "SALES": 12.00}, {"IT": 2.0, "SALES": 1.4})], + ("id", "base", "ratio"), ) -row = df.select(map_zip_with( -"base", "ratio", lambda k, v1, v2: round(v1 * v2, 2)).alias("updated_data") +row = df.select( + map_zip_with("base", "ratio", lambda k, v1, v2: round(v1 * v2, 2)).alias( + "updated_data" + ) ).head() sorted(row["updated_data"].items()) -#osos.functions.explode: +# osos.functions.explode: from osos import Row -eDF = OsosSession.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) + +eDF = OsosSession.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]) eDF.select(explode(eDF.intlist).alias("anInt")).collect() eDF.select(explode(eDF.mapfield).alias("key", "value")).show() -#osos.functions.explode_outer: +# osos.functions.explode_outer: df = OsosSession.createDataFrame( -[(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], -("id", "an_array", "a_map") + [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], + ("id", "an_array", "a_map"), ) df.select("id", "an_array", explode_outer("a_map")).show() df.select("id", "a_map", explode_outer("an_array")).show() -#osos.functions.posexplode: +# osos.functions.posexplode: from osos import Row -eDF = OsosSession.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) + +eDF = OsosSession.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]) eDF.select(posexplode(eDF.intlist)).collect() eDF.select(posexplode(eDF.mapfield)).show() -#osos.functions.posexplode_outer: +# osos.functions.posexplode_outer: df = OsosSession.createDataFrame( -[(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], -("id", "an_array", "a_map") + [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], + ("id", "an_array", "a_map"), ) df.select("id", "an_array", posexplode_outer("a_map")).show() df.select("id", "a_map", posexplode_outer("an_array")).show() -#osos.functions.inline: +# osos.functions.inline: from osos import Row + df = OsosSession.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])]) df.select(inline(df.structlist)).show() -#osos.functions.inline_outer: +# osos.functions.inline_outer: from osos import Row -df = OsosSession.createDataFrame([ -Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]), -Row(id=2, structlist=[]) -]) -df.select('id', inline_outer(df.structlist)).show() - -#osos.functions.get: -df = OsosSession.createDataFrame([(["a", "b", "c"], 1)], ['data', 'index']) + +df = OsosSession.createDataFrame( + [Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]), Row(id=2, structlist=[])] +) +df.select("id", inline_outer(df.structlist)).show() + +# osos.functions.get: +df = OsosSession.createDataFrame([(["a", "b", "c"], 1)], ["data", "index"]) df.select(get(df.data, 1)).show() df.select(get(df.data, -1)).show() @@ -719,49 +808,54 @@ def merge(acc, x): df.select(get(df.data, col("index") - 1)).show() -#osos.functions.get_json_object: -data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] +# osos.functions.get_json_object: +data = [("1", """{"f1": "value1", "f2": "value2"}"""), ("2", """{"f1": "value12"}""")] df = OsosSession.createDataFrame(data, ("key", "jstring")) -df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \ -get_json_object(df.jstring, '$.f2').alias("c1") ).collect() +df.select( + df.key, + get_json_object(df.jstring, "$.f1").alias("c0"), + get_json_object(df.jstring, "$.f2").alias("c1"), +).collect() -#osos.functions.json_tuple: -data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] +# osos.functions.json_tuple: +data = [("1", """{"f1": "value1", "f2": "value2"}"""), ("2", """{"f1": "value12"}""")] df = OsosSession.createDataFrame(data, ("key", "jstring")) -df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() +df.select(df.key, json_tuple(df.jstring, "f1", "f2")).collect() -#osos.functions.from_json: +# osos.functions.from_json: from osos.types import * -data = [(1, '''{"a": 1}''')] + +data = [(1, """{"a": 1}""")] schema = StructType([StructField("a", IntegerType())]) df = OsosSession.createDataFrame(data, ("key", "value")) df.select(from_json(df.value, schema).alias("json")).collect() df.select(from_json(df.value, "a INT").alias("json")).collect() df.select(from_json(df.value, "MAP").alias("json")).collect() -data = [(1, '''[{"a": 1}]''')] +data = [(1, """[{"a": 1}]""")] schema = ArrayType(StructType([StructField("a", IntegerType())])) df = OsosSession.createDataFrame(data, ("key", "value")) df.select(from_json(df.value, schema).alias("json")).collect() -schema = schema_of_json(lit('''{"a": 0}''')) +schema = schema_of_json(lit("""{"a": 0}""")) df.select(from_json(df.value, schema).alias("json")).collect() -data = [(1, '''[1, 2, 3]''')] +data = [(1, """[1, 2, 3]""")] schema = ArrayType(IntegerType()) df = OsosSession.createDataFrame(data, ("key", "value")) df.select(from_json(df.value, schema).alias("json")).collect() -#osos.functions.schema_of_json: +# osos.functions.schema_of_json: df = OsosSession.range(1) df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() -schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'}) +schema = schema_of_json("{a: 1}", {"allowUnquotedFieldNames": "true"}) df.select(schema.alias("json")).collect() -#osos.functions.to_json: +# osos.functions.to_json: from osos import Row from osos.types import * -data = [(1, Row(age=2, name='Alice'))] + +data = [(1, Row(age=2, name="Alice"))] df = OsosSession.createDataFrame(data, ("key", "value")) df.select(to_json(df.value).alias("json")).collect() -data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])] +data = [(1, [Row(age=2, name="Alice"), Row(age=3, name="Bob")])] df = OsosSession.createDataFrame(data, ("key", "value")) df.select(to_json(df.value).alias("json")).collect() data = [(1, {"name": "Alice"})] @@ -774,94 +868,105 @@ def merge(acc, x): df = OsosSession.createDataFrame(data, ("key", "value")) df.select(to_json(df.value).alias("json")).collect() -#osos.functions.size: -df = OsosSession.createDataFrame([([1, 2, 3],),([1],),([],)], ['data']) +# osos.functions.size: +df = OsosSession.createDataFrame([([1, 2, 3],), ([1],), ([],)], ["data"]) df.select(size(df.data)).collect() -#osos.functions.struct: +# osos.functions.struct: df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) -df.select(struct('age', 'name').alias("struct")).collect() +df.select(struct("age", "name").alias("struct")).collect() df.select(struct([df.age, df.name]).alias("struct")).collect() -#osos.functions.sort_array: -df = OsosSession.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data']) -df.select(sort_array(df.data).alias('r')).collect() -df.select(sort_array(df.data, asc=False).alias('r')).collect() +# osos.functions.sort_array: +df = OsosSession.createDataFrame([([2, 1, None, 3],), ([1],), ([],)], ["data"]) +df.select(sort_array(df.data).alias("r")).collect() +df.select(sort_array(df.data, asc=False).alias("r")).collect() -#osos.functions.array_max: -df = OsosSession.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) -df.select(array_max(df.data).alias('max')).collect() +# osos.functions.array_max: +df = OsosSession.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ["data"]) +df.select(array_max(df.data).alias("max")).collect() -#osos.functions.array_min: -df = OsosSession.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) -df.select(array_min(df.data).alias('min')).collect() +# osos.functions.array_min: +df = OsosSession.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ["data"]) +df.select(array_min(df.data).alias("min")).collect() -#osos.functions.shuffle: -df = OsosSession.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ['data']) -df.select(shuffle(df.data).alias('s')).collect() +# osos.functions.shuffle: +df = OsosSession.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ["data"]) +df.select(shuffle(df.data).alias("s")).collect() -#osos.functions.reverse: -df = OsosSession.createDataFrame([('Spark SQL',)], ['data']) -df.select(reverse(df.data).alias('s')).collect() -df = OsosSession.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data']) -df.select(reverse(df.data).alias('r')).collect() +# osos.functions.reverse: +df = OsosSession.createDataFrame([("Spark SQL",)], ["data"]) +df.select(reverse(df.data).alias("s")).collect() +df = OsosSession.createDataFrame([([2, 1, 3],), ([1],), ([],)], ["data"]) +df.select(reverse(df.data).alias("r")).collect() -#osos.functions.flatten: -df = OsosSession.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data']) +# osos.functions.flatten: +df = OsosSession.createDataFrame( + [([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ["data"] +) df.show(truncate=False) -df.select(flatten(df.data).alias('r')).show() +df.select(flatten(df.data).alias("r")).show() -#osos.functions.sequence: -df1 = OsosSession.createDataFrame([(-2, 2)], ('C1', 'C2')) -df1.select(sequence('C1', 'C2').alias('r')).collect() -df2 = OsosSession.createDataFrame([(4, -4, -2)], ('C1', 'C2', 'C3')) -df2.select(sequence('C1', 'C2', 'C3').alias('r')).collect() +# osos.functions.sequence: +df1 = OsosSession.createDataFrame([(-2, 2)], ("C1", "C2")) +df1.select(sequence("C1", "C2").alias("r")).collect() +df2 = OsosSession.createDataFrame([(4, -4, -2)], ("C1", "C2", "C3")) +df2.select(sequence("C1", "C2", "C3").alias("r")).collect() -#osos.functions.array_repeat: -df = OsosSession.createDataFrame([('ab',)], ['data']) -df.select(array_repeat(df.data, 3).alias('r')).collect() +# osos.functions.array_repeat: +df = OsosSession.createDataFrame([("ab",)], ["data"]) +df.select(array_repeat(df.data, 3).alias("r")).collect() -#osos.functions.map_contains_key: +# osos.functions.map_contains_key: from osos.functions import map_contains_key + df = OsosSession.sql("SELECT map(1, 'a', 2, 'b') as data") df.select(map_contains_key("data", 1)).show() df.select(map_contains_key("data", -1)).show() -#osos.functions.map_keys: +# osos.functions.map_keys: from osos.functions import map_keys + df = OsosSession.sql("SELECT map(1, 'a', 2, 'b') as data") df.select(map_keys("data").alias("keys")).show() -#osos.functions.map_values: +# osos.functions.map_values: from osos.functions import map_values + df = OsosSession.sql("SELECT map(1, 'a', 2, 'b') as data") df.select(map_values("data").alias("values")).show() -#osos.functions.map_entries: +# osos.functions.map_entries: from osos.functions import map_entries + df = OsosSession.sql("SELECT map(1, 'a', 2, 'b') as data") df = df.select(map_entries("data").alias("entries")) df.show() df.printSchema() -#osos.functions.map_from_entries: +# osos.functions.map_from_entries: from osos.functions import map_from_entries + df = OsosSession.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data") df.select(map_from_entries("data").alias("map")).show() -#osos.functions.arrays_zip: +# osos.functions.arrays_zip: from osos.functions import arrays_zip -df = OsosSession.createDataFrame([(([1, 2, 3], [2, 4, 6], [3, 6]))], ['vals1', 'vals2', 'vals3']) -df = df.select(arrays_zip(df.vals1, df.vals2, df.vals3).alias('zipped')) + +df = OsosSession.createDataFrame( + [(([1, 2, 3], [2, 4, 6], [3, 6]))], ["vals1", "vals2", "vals3"] +) +df = df.select(arrays_zip(df.vals1, df.vals2, df.vals3).alias("zipped")) df.show(truncate=False) df.printSchema() -#osos.functions.map_concat: +# osos.functions.map_concat: from osos.functions import map_concat + df = OsosSession.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2") df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False) -#osos.functions.from_csv: +# osos.functions.from_csv: data = [("1,2,3",)] df = OsosSession.createDataFrame(data, ("value",)) df.select(from_csv(df.value, "a INT, b INT, c INT").alias("csv")).collect() @@ -869,601 +974,723 @@ def merge(acc, x): df.select(from_csv(df.value, schema_of_csv(value)).alias("csv")).collect() data = [(" abc",)] df = OsosSession.createDataFrame(data, ("value",)) -options = {'ignoreLeadingWhiteSpace': True} +options = {"ignoreLeadingWhiteSpace": True} df.select(from_csv(df.value, "s string", options).alias("csv")).collect() -#osos.functions.schema_of_csv: +# osos.functions.schema_of_csv: df = OsosSession.range(1) -df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect() -df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect() +df.select(schema_of_csv(lit("1|a"), {"sep": "|"}).alias("csv")).collect() +df.select(schema_of_csv("1|a", {"sep": "|"}).alias("csv")).collect() -#osos.functions.to_csv: +# osos.functions.to_csv: from osos import Row -data = [(1, Row(age=2, name='Alice'))] + +data = [(1, Row(age=2, name="Alice"))] df = OsosSession.createDataFrame(data, ("key", "value")) df.select(to_csv(df.value).alias("csv")).collect() -#osos.functions.years: -df.writeTo("catalog.db.table").partitionedBy( -years("ts") -).createOrReplace() - -#osos.functions.months: -df.writeTo("catalog.db.table").partitionedBy( -months("ts") -).createOrReplace() - -#osos.functions.days: -df.writeTo("catalog.db.table").partitionedBy( -days("ts") -).createOrReplace() - -#osos.functions.hours: -df.writeTo("catalog.db.table").partitionedBy( -hours("ts") -).createOrReplace() - -#osos.functions.bucket: -df.writeTo("catalog.db.table").partitionedBy( -bucket(42, "ts") -).createOrReplace() - -#osos.functions.approxCountDistinct: -#osos.functions.approx_count_distinct: -df = OsosSession.createDataFrame([1,2,2,3], "INT") -df.agg(approx_count_distinct("value").alias('distinct_values')).show() - -#osos.functions.avg: +# osos.functions.years: +df.writeTo("catalog.db.table").partitionedBy(years("ts")).createOrReplace() + +# osos.functions.months: +df.writeTo("catalog.db.table").partitionedBy(months("ts")).createOrReplace() + +# osos.functions.days: +df.writeTo("catalog.db.table").partitionedBy(days("ts")).createOrReplace() + +# osos.functions.hours: +df.writeTo("catalog.db.table").partitionedBy(hours("ts")).createOrReplace() + +# osos.functions.bucket: +df.writeTo("catalog.db.table").partitionedBy(bucket(42, "ts")).createOrReplace() + +# osos.functions.approxCountDistinct: +# osos.functions.approx_count_distinct: +df = OsosSession.createDataFrame([1, 2, 2, 3], "INT") +df.agg(approx_count_distinct("value").alias("distinct_values")).show() + +# osos.functions.avg: df = OsosSession.range(10) df.select(avg(col("id"))).show() -#osos.functions.collect_list: -df2 = OsosSession.createDataFrame([(2,), (5,), (5,)], ('age',)) -df2.agg(collect_list('age')).collect() +# osos.functions.collect_list: +df2 = OsosSession.createDataFrame([(2,), (5,), (5,)], ("age",)) +df2.agg(collect_list("age")).collect() -#osos.functions.collect_set: -df2 = OsosSession.createDataFrame([(2,), (5,), (5,)], ('age',)) -df2.agg(array_sort(collect_set('age')).alias('c')).collect() +# osos.functions.collect_set: +df2 = OsosSession.createDataFrame([(2,), (5,), (5,)], ("age",)) +df2.agg(array_sort(collect_set("age")).alias("c")).collect() -#osos.functions.corr: +# osos.functions.corr: a = range(20) b = [2 * x for x in range(20)] df = OsosSession.createDataFrame(zip(a, b), ["a", "b"]) -df.agg(corr("a", "b").alias('c')).collect() +df.agg(corr("a", "b").alias("c")).collect() -#osos.functions.count: -df = OsosSession.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"]) +# osos.functions.count: +df = OsosSession.createDataFrame( + [(None,), ("a",), ("b",), ("c",)], schema=["alphabets"] +) df.select(count(expr("*")), count(df.alphabets)).show() -#osos.functions.count_distinct: +# osos.functions.count_distinct: from osos import types + df1 = OsosSession.createDataFrame([1, 1, 3], types.IntegerType()) df2 = OsosSession.createDataFrame([1, 2], types.IntegerType()) df1.join(df2).show() df1.join(df2).select(count_distinct(df1.value, df2.value)).show() -#osos.functions.countDistinct: -#osos.functions.covar_pop: +# osos.functions.countDistinct: +# osos.functions.covar_pop: a = [1] * 10 b = [1] * 10 df = OsosSession.createDataFrame(zip(a, b), ["a", "b"]) -df.agg(covar_pop("a", "b").alias('c')).collect() +df.agg(covar_pop("a", "b").alias("c")).collect() -#osos.functions.covar_samp: +# osos.functions.covar_samp: a = [1] * 10 b = [1] * 10 df = OsosSession.createDataFrame(zip(a, b), ["a", "b"]) -df.agg(covar_samp("a", "b").alias('c')).collect() +df.agg(covar_samp("a", "b").alias("c")).collect() -#osos.functions.first: -df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) +# osos.functions.first: +df = OsosSession.createDataFrame( + [("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age") +) df = df.orderBy(df.age) df.groupby("name").agg(first("age")).orderBy("name").show() df.groupby("name").agg(first("age", ignorenulls=True)).orderBy("name").show() -#osos.functions.grouping: +# osos.functions.grouping: df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show() -#osos.functions.grouping_id: -df = OsosSession.createDataFrame([(1, "a", "a"), -(3, "a", "a"), -(4, "b", "c")], ["c1", "c2", "c3"]) +# osos.functions.grouping_id: +df = OsosSession.createDataFrame( + [(1, "a", "a"), (3, "a", "a"), (4, "b", "c")], ["c1", "c2", "c3"] +) df.cube("c2", "c3").agg(grouping_id(), sum("c1")).orderBy("c2", "c3").show() -#osos.functions.kurtosis: -df = OsosSession.createDataFrame([[1],[1],[2]], ["c"]) +# osos.functions.kurtosis: +df = OsosSession.createDataFrame([[1], [1], [2]], ["c"]) df.select(kurtosis(df.c)).show() -#osos.functions.last: -df = OsosSession.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) +# osos.functions.last: +df = OsosSession.createDataFrame( + [("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age") +) df = df.orderBy(df.age.desc()) df.groupby("name").agg(last("age")).orderBy("name").show() df.groupby("name").agg(last("age", ignorenulls=True)).orderBy("name").show() -#osos.functions.max: +# osos.functions.max: df = OsosSession.range(10) df.select(max(col("id"))).show() -#osos.functions.max_by: -df = OsosSession.createDataFrame([ -("Java", 2012, 20000), ("dotNET", 2012, 5000), -("dotNET", 2013, 48000), ("Java", 2013, 30000)], -schema=("course", "year", "earnings")) +# osos.functions.max_by: +df = OsosSession.createDataFrame( + [ + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000), + ], + schema=("course", "year", "earnings"), +) df.groupby("course").agg(max_by("year", "earnings")).show() -#osos.functions.mean: +# osos.functions.mean: df = OsosSession.range(10) df.select(mean(df.id)).show() -#osos.functions.median: -df = OsosSession.createDataFrame([ -("Java", 2012, 20000), ("dotNET", 2012, 5000), -("Java", 2012, 22000), ("dotNET", 2012, 10000), -("dotNET", 2013, 48000), ("Java", 2013, 30000)], -schema=("course", "year", "earnings")) +# osos.functions.median: +df = OsosSession.createDataFrame( + [ + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("Java", 2012, 22000), + ("dotNET", 2012, 10000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000), + ], + schema=("course", "year", "earnings"), +) df.groupby("course").agg(median("earnings")).show() -#osos.functions.min: +# osos.functions.min: df = OsosSession.range(10) df.select(min(df.id)).show() -#osos.functions.min_by: -df = OsosSession.createDataFrame([ -("Java", 2012, 20000), ("dotNET", 2012, 5000), -("dotNET", 2013, 48000), ("Java", 2013, 30000)], -schema=("course", "year", "earnings")) +# osos.functions.min_by: +df = OsosSession.createDataFrame( + [ + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000), + ], + schema=("course", "year", "earnings"), +) df.groupby("course").agg(min_by("year", "earnings")).show() -#osos.functions.mode: -df = OsosSession.createDataFrame([ -("Java", 2012, 20000), ("dotNET", 2012, 5000), -("Java", 2012, 20000), ("dotNET", 2012, 5000), -("dotNET", 2013, 48000), ("Java", 2013, 30000)], -schema=("course", "year", "earnings")) +# osos.functions.mode: +df = OsosSession.createDataFrame( + [ + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("Java", 2012, 20000), + ("dotNET", 2012, 5000), + ("dotNET", 2013, 48000), + ("Java", 2013, 30000), + ], + schema=("course", "year", "earnings"), +) df.groupby("course").agg(mode("year")).show() -#osos.functions.percentile_approx: +# osos.functions.percentile_approx: key = (col("id") % 3).alias("key") value = (randn(42) + key * 10).alias("value") df = OsosSession.range(0, 1000, 1, 1).select(key, value) df.select( -percentile_approx("value", [0.25, 0.5, 0.75], 1000000).alias("quantiles") + percentile_approx("value", [0.25, 0.5, 0.75], 1000000).alias("quantiles") ).printSchema() df.groupBy("key").agg( -percentile_approx("value", 0.5, lit(1000000)).alias("median") + percentile_approx("value", 0.5, lit(1000000)).alias("median") ).printSchema() -#osos.functions.product: -df = OsosSession.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3) -prods = df.groupBy('mod3').agg(product('x').alias('product')) -prods.orderBy('mod3').show() +# osos.functions.product: +df = OsosSession.range(1, 10).toDF("x").withColumn("mod3", col("x") % 3) +prods = df.groupBy("mod3").agg(product("x").alias("product")) +prods.orderBy("mod3").show() -#osos.functions.skewness: -df = OsosSession.createDataFrame([[1],[1],[2]], ["c"]) +# osos.functions.skewness: +df = OsosSession.createDataFrame([[1], [1], [2]], ["c"]) df.select(skewness(df.c)).first() -#osos.functions.stddev: +# osos.functions.stddev: df = OsosSession.range(6) df.select(stddev(df.id)).first() -#osos.functions.stddev_pop: +# osos.functions.stddev_pop: df = OsosSession.range(6) df.select(stddev_pop(df.id)).first() -#osos.functions.stddev_samp: +# osos.functions.stddev_samp: df = OsosSession.range(6) df.select(stddev_samp(df.id)).first() -#osos.functions.sum: +# osos.functions.sum: df = OsosSession.range(10) df.select(sum(df["id"])).show() -#osos.functions.sum_distinct: +# osos.functions.sum_distinct: df = OsosSession.createDataFrame([(None,), (1,), (1,), (2,)], schema=["numbers"]) df.select(sum_distinct(col("numbers"))).show() -#osos.functions.sumDistinct: -#osos.functions.var_pop: +# osos.functions.sumDistinct: +# osos.functions.var_pop: df = OsosSession.range(6) df.select(var_pop(df.id)).first() -#osos.functions.var_samp: +# osos.functions.var_samp: df = OsosSession.range(6) df.select(var_samp(df.id)).show() -#osos.functions.variance: +# osos.functions.variance: df = OsosSession.range(6) df.select(variance(df.id)).show() -#osos.functions.cume_dist: +# osos.functions.cume_dist: from osos import Window, types + df = OsosSession.createDataFrame([1, 2, 3, 3, 4], types.IntegerType()) w = Window.orderBy("value") df.withColumn("cd", cume_dist().over(w)).show() -#osos.functions.dense_rank: +# osos.functions.dense_rank: from osos import Window, types + df = OsosSession.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) w = Window.orderBy("value") df.withColumn("drank", dense_rank().over(w)).show() -#osos.functions.lag: +# osos.functions.lag: from osos import Window -df = OsosSession.createDataFrame([("a", 1), -("a", 2), -("a", 3), -("b", 8), -("b", 2)], ["c1", "c2"]) + +df = OsosSession.createDataFrame( + [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"] +) df.show() w = Window.partitionBy("c1").orderBy("c2") df.withColumn("previos_value", lag("c2").over(w)).show() df.withColumn("previos_value", lag("c2", 1, 0).over(w)).show() df.withColumn("previos_value", lag("c2", 2, -1).over(w)).show() -#osos.functions.lead: +# osos.functions.lead: from osos import Window -df = OsosSession.createDataFrame([("a", 1), -("a", 2), -("a", 3), -("b", 8), -("b", 2)], ["c1", "c2"]) + +df = OsosSession.createDataFrame( + [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"] +) df.show() w = Window.partitionBy("c1").orderBy("c2") df.withColumn("next_value", lead("c2").over(w)).show() df.withColumn("next_value", lead("c2", 1, 0).over(w)).show() df.withColumn("next_value", lead("c2", 2, -1).over(w)).show() -#osos.functions.nth_value: +# osos.functions.nth_value: from osos import Window -df = OsosSession.createDataFrame([("a", 1), -("a", 2), -("a", 3), -("b", 8), -("b", 2)], ["c1", "c2"]) + +df = OsosSession.createDataFrame( + [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"] +) df.show() w = Window.partitionBy("c1").orderBy("c2") df.withColumn("nth_value", nth_value("c2", 1).over(w)).show() df.withColumn("nth_value", nth_value("c2", 2).over(w)).show() -#osos.functions.ntile: +# osos.functions.ntile: from osos import Window -df = OsosSession.createDataFrame([("a", 1), -("a", 2), -("a", 3), -("b", 8), -("b", 2)], ["c1", "c2"]) + +df = OsosSession.createDataFrame( + [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"] +) df.show() w = Window.partitionBy("c1").orderBy("c2") df.withColumn("ntile", ntile(2).over(w)).show() -#osos.functions.percent_rank: +# osos.functions.percent_rank: from osos import Window, types + df = OsosSession.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) w = Window.orderBy("value") df.withColumn("pr", percent_rank().over(w)).show() -#osos.functions.rank: +# osos.functions.rank: from osos import Window, types + df = OsosSession.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType()) w = Window.orderBy("value") df.withColumn("drank", rank().over(w)).show() -#osos.functions.row_number: +# osos.functions.row_number: from osos import Window + df = OsosSession.range(3) w = Window.orderBy(df.id.desc()) df.withColumn("desc_order", row_number().over(w)).show() -#osos.functions.asc: +# osos.functions.asc: df = OsosSession.range(5) df = df.sort(desc("id")) df.show() df.orderBy(asc("id")).show() -#osos.functions.asc_nulls_first: -df1 = OsosSession.createDataFrame([(1, "Bob"), -(0, None), -(2, "Alice")], ["age", "name"]) +# osos.functions.asc_nulls_first: +df1 = OsosSession.createDataFrame( + [(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"] +) df1.sort(asc_nulls_first(df1.name)).show() -#osos.functions.asc_nulls_last: -df1 = OsosSession.createDataFrame([(0, None), -(1, "Bob"), -(2, "Alice")], ["age", "name"]) +# osos.functions.asc_nulls_last: +df1 = OsosSession.createDataFrame( + [(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"] +) df1.sort(asc_nulls_last(df1.name)).show() -#osos.functions.desc: +# osos.functions.desc: OsosSession.range(5).orderBy(desc("id")).show() -#osos.functions.desc_nulls_first: -df1 = OsosSession.createDataFrame([(0, None), -(1, "Bob"), -(2, "Alice")], ["age", "name"]) +# osos.functions.desc_nulls_first: +df1 = OsosSession.createDataFrame( + [(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"] +) df1.sort(desc_nulls_first(df1.name)).show() -#osos.functions.desc_nulls_last: -df1 = OsosSession.createDataFrame([(0, None), -(1, "Bob"), -(2, "Alice")], ["age", "name"]) +# osos.functions.desc_nulls_last: +df1 = OsosSession.createDataFrame( + [(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"] +) df1.sort(desc_nulls_last(df1.name)).show() -#osos.functions.ascii: +# osos.functions.ascii: df = OsosSession.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") df.select(ascii("value")).show() -#osos.functions.base64: +# osos.functions.base64: df = OsosSession.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") df.select(base64("value")).show() -#osos.functions.bit_length: +# osos.functions.bit_length: from osos.functions import bit_length -OsosSession.createDataFrame([('cat',), ( '🐈',)], ['cat']).select(bit_length('cat')).collect() -#osos.functions.concat_ws: -df = OsosSession.createDataFrame([('abcd','123')], ['s', 'd']) -df.select(concat_ws('-', df.s, df.d).alias('s')).collect() +OsosSession.createDataFrame([("cat",), ("🐈",)], ["cat"]).select( + bit_length("cat") +).collect() + +# osos.functions.concat_ws: +df = OsosSession.createDataFrame([("abcd", "123")], ["s", "d"]) +df.select(concat_ws("-", df.s, df.d).alias("s")).collect() -#osos.functions.decode: -df = OsosSession.createDataFrame([('abcd',)], ['a']) +# osos.functions.decode: +df = OsosSession.createDataFrame([("abcd",)], ["a"]) df.select(decode("a", "UTF-8")).show() -#osos.functions.encode: -df = OsosSession.createDataFrame([('abcd',)], ['c']) +# osos.functions.encode: +df = OsosSession.createDataFrame([("abcd",)], ["c"]) df.select(encode("c", "UTF-8")).show() -#osos.functions.format_number: -OsosSession.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() +# osos.functions.format_number: +OsosSession.createDataFrame([(5,)], ["a"]).select( + format_number("a", 4).alias("v") +).collect() -#osos.functions.format_string: -df = OsosSession.createDataFrame([(5, "hello")], ['a', 'b']) -df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() +# osos.functions.format_string: +df = OsosSession.createDataFrame([(5, "hello")], ["a", "b"]) +df.select(format_string("%d %s", df.a, df.b).alias("v")).collect() -#osos.functions.initcap: -OsosSession.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() +# osos.functions.initcap: +OsosSession.createDataFrame([("ab cd",)], ["a"]).select( + initcap("a").alias("v") +).collect() -#osos.functions.instr: -df = OsosSession.createDataFrame([('abcd',)], ['s',]) -df.select(instr(df.s, 'b').alias('s')).collect() +# osos.functions.instr: +df = OsosSession.createDataFrame( + [("abcd",)], + [ + "s", + ], +) +df.select(instr(df.s, "b").alias("s")).collect() -#osos.functions.length: -OsosSession.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect() +# osos.functions.length: +OsosSession.createDataFrame([("ABC ",)], ["a"]).select( + length("a").alias("length") +).collect() -#osos.functions.lower: +# osos.functions.lower: df = OsosSession.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") df.select(lower("value")).show() -#osos.functions.levenshtein: -df0 = OsosSession.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) -df0.select(levenshtein('l', 'r').alias('d')).collect() +# osos.functions.levenshtein: +df0 = OsosSession.createDataFrame( + [ + ( + "kitten", + "sitting", + ) + ], + ["l", "r"], +) +df0.select(levenshtein("l", "r").alias("d")).collect() -#osos.functions.locate: -df = OsosSession.createDataFrame([('abcd',)], ['s',]) -df.select(locate('b', df.s, 1).alias('s')).collect() +# osos.functions.locate: +df = OsosSession.createDataFrame( + [("abcd",)], + [ + "s", + ], +) +df.select(locate("b", df.s, 1).alias("s")).collect() -#osos.functions.lpad: -df = OsosSession.createDataFrame([('abcd',)], ['s',]) -df.select(lpad(df.s, 6, '#').alias('s')).collect() +# osos.functions.lpad: +df = OsosSession.createDataFrame( + [("abcd",)], + [ + "s", + ], +) +df.select(lpad(df.s, 6, "#").alias("s")).collect() -#osos.functions.ltrim: +# osos.functions.ltrim: df = OsosSession.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show() -#osos.functions.octet_length: +# osos.functions.octet_length: from osos.functions import octet_length -OsosSession.createDataFrame([('cat',), ( '🐈',)], ['cat']).select(octet_length('cat')).collect() - -#osos.functions.regexp_extract: -df = OsosSession.createDataFrame([('100-200',)], ['str']) -df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() -df = OsosSession.createDataFrame([('foo',)], ['str']) -df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() -df = OsosSession.createDataFrame([('aaaac',)], ['str']) -df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() - -#osos.functions.regexp_replace: -df = OsosSession.createDataFrame([("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"]) -df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() -df.select(regexp_replace("str", col("pattern"), col("replacement")).alias('d')).collect() - -#osos.functions.unbase64: -df = OsosSession.createDataFrame(["U3Bhcms=", -"UHlTcGFyaw==", -"UGFuZGFzIEFQSQ=="], "STRING") + +OsosSession.createDataFrame([("cat",), ("🐈",)], ["cat"]).select( + octet_length("cat") +).collect() + +# osos.functions.regexp_extract: +df = OsosSession.createDataFrame([("100-200",)], ["str"]) +df.select(regexp_extract("str", r"(\d+)-(\d+)", 1).alias("d")).collect() +df = OsosSession.createDataFrame([("foo",)], ["str"]) +df.select(regexp_extract("str", r"(\d+)", 1).alias("d")).collect() +df = OsosSession.createDataFrame([("aaaac",)], ["str"]) +df.select(regexp_extract("str", "(a+)(b)?(c)", 2).alias("d")).collect() + +# osos.functions.regexp_replace: +df = OsosSession.createDataFrame( + [("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"] +) +df.select(regexp_replace("str", r"(\d+)", "--").alias("d")).collect() +df.select( + regexp_replace("str", col("pattern"), col("replacement")).alias("d") +).collect() + +# osos.functions.unbase64: +df = OsosSession.createDataFrame( + ["U3Bhcms=", "UHlTcGFyaw==", "UGFuZGFzIEFQSQ=="], "STRING" +) df.select(unbase64("value")).show() -#osos.functions.rpad: -df = OsosSession.createDataFrame([('abcd',)], ['s',]) -df.select(rpad(df.s, 6, '#').alias('s')).collect() +# osos.functions.rpad: +df = OsosSession.createDataFrame( + [("abcd",)], + [ + "s", + ], +) +df.select(rpad(df.s, 6, "#").alias("s")).collect() -#osos.functions.repeat: -df = OsosSession.createDataFrame([('ab',)], ['s',]) -df.select(repeat(df.s, 3).alias('s')).collect() +# osos.functions.repeat: +df = OsosSession.createDataFrame( + [("ab",)], + [ + "s", + ], +) +df.select(repeat(df.s, 3).alias("s")).collect() -#osos.functions.rtrim: +# osos.functions.rtrim: df = OsosSession.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show() -#osos.functions.soundex: -df = OsosSession.createDataFrame([("Peters",),("Uhrbach",)], ['name']) +# osos.functions.soundex: +df = OsosSession.createDataFrame([("Peters",), ("Uhrbach",)], ["name"]) df.select(soundex(df.name).alias("soundex")).collect() -#osos.functions.split: -df = OsosSession.createDataFrame([('oneAtwoBthreeC',)], ['s',]) -df.select(split(df.s, '[ABC]', 2).alias('s')).collect() -df.select(split(df.s, '[ABC]', -1).alias('s')).collect() +# osos.functions.split: +df = OsosSession.createDataFrame( + [("oneAtwoBthreeC",)], + [ + "s", + ], +) +df.select(split(df.s, "[ABC]", 2).alias("s")).collect() +df.select(split(df.s, "[ABC]", -1).alias("s")).collect() -#osos.functions.substring: -df = OsosSession.createDataFrame([('abcd',)], ['s',]) -df.select(substring(df.s, 1, 2).alias('s')).collect() +# osos.functions.substring: +df = OsosSession.createDataFrame( + [("abcd",)], + [ + "s", + ], +) +df.select(substring(df.s, 1, 2).alias("s")).collect() -#osos.functions.substring_index: -df = OsosSession.createDataFrame([('a.b.c.d',)], ['s']) -df.select(substring_index(df.s, '.', 2).alias('s')).collect() -df.select(substring_index(df.s, '.', -3).alias('s')).collect() +# osos.functions.substring_index: +df = OsosSession.createDataFrame([("a.b.c.d",)], ["s"]) +df.select(substring_index(df.s, ".", 2).alias("s")).collect() +df.select(substring_index(df.s, ".", -3).alias("s")).collect() -#osos.functions.overlay: +# osos.functions.overlay: df = OsosSession.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y")) df.select(overlay("x", "y", 7).alias("overlayed")).collect() df.select(overlay("x", "y", 7, 0).alias("overlayed")).collect() df.select(overlay("x", "y", 7, 2).alias("overlayed")).collect() -#osos.functions.sentences: +# osos.functions.sentences: df = OsosSession.createDataFrame([["This is an example sentence."]], ["string"]) df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False) df = OsosSession.createDataFrame([["Hello world. How are you?"]], ["s"]) df.select(sentences("s")).show(truncate=False) -#osos.functions.translate: -OsosSession.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123").alias('r')).collect() +# osos.functions.translate: +OsosSession.createDataFrame([("translate",)], ["a"]).select( + translate("a", "rnlt", "123").alias("r") +).collect() -#osos.functions.trim: +# osos.functions.trim: df = OsosSession.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") df.select(trim("value").alias("r")).withColumn("length", length("r")).show() -#osos.functions.upper: +# osos.functions.upper: df = OsosSession.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") df.select(upper("value")).show() -#osos.functions.call_udf: +# osos.functions.call_udf: from osos.functions import call_udf, col from osos.types import IntegerType, StringType -df = OsosSession.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"]) + +df = OsosSession.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["id", "name"]) _ = OsosSession.udf.register("intX2", lambda i: i * 2, IntegerType()) df.select(call_udf("intX2", "id")).show() _ = OsosSession.udf.register("strX2", lambda s: s * 2, StringType()) df.select(call_udf("strX2", col("name"))).show() -#osos.functions.pandas_udf: +# osos.functions.pandas_udf: import pandas as pd from osos.functions import pandas_udf + @pandas_udf(IntegerType()) def slen(s: pd.Series) -> pd.Series: return s.str.len() + from osos.functions import PandasUDFType from osos.types import IntegerType + + @pandas_udf(IntegerType(), PandasUDFType.SCALAR) def slen(s): return s.str.len() + @pandas_udf("col1 string, col2 long") def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame: - s3['col2'] = s1 + s2.str.len() + s3["col2"] = s1 + s2.str.len() return s3 + + # Create a Spark DataFrame that has three columns including a struct column df = OsosSession.createDataFrame( -[[1, "a string", ("a nested string",)]], -"long_col long, string_col string, struct_col struct") + [[1, "a string", ("a nested string",)]], + "long_col long, string_col string, struct_col struct", +) df.printSchema() df.select(func("long_col", "string_col", "struct_col")).printSchema() + @pandas_udf("string") def to_upper(s: pd.Series) -> pd.Series: return s.str.upper() + + df = OsosSession.createDataFrame([("John Doe",)], ("name",)) df.select(to_upper("name")).show() + @pandas_udf("first string, last string") def split_expand(s: pd.Series) -> pd.DataFrame: return s.str.split(expand=True) + + df = OsosSession.createDataFrame([("John Doe",)], ("name",)) df.select(split_expand("name")).show() from typing import Iterator + + @pandas_udf("long") def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: for s in iterator: yield s + 1 + + df = OsosSession.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"])) df.select(plus_one(df.v)).show() from typing import Iterator, Tuple from osos.functions import struct, col + + @pandas_udf("long") def multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]: for s1, df in iterator: yield s1 * df.v + + df = OsosSession.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"])) -df.withColumn('output', multiply(col("v"), struct(col("v")))).show() +df.withColumn("output", multiply(col("v"), struct(col("v")))).show() + @pandas_udf("double") def mean_udf(v: pd.Series) -> float: return v.mean() + + df = OsosSession.createDataFrame( -[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) -df.groupby("id").agg(mean_udf(df['v'])).show() + [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v") +) +df.groupby("id").agg(mean_udf(df["v"])).show() from osos import Window + + @pandas_udf("double") def mean_udf(v: pd.Series) -> float: return v.mean() + + df = OsosSession.createDataFrame( -[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) -w = Window.partitionBy('id').orderBy('v').rowsBetween(-1, 0) -df.withColumn('mean_v', mean_udf("v").over(w)).show() + [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v") +) +w = Window.partitionBy("id").orderBy("v").rowsBetween(-1, 0) +df.withColumn("mean_v", mean_udf("v").over(w)).show() -#osos.functions.udf: +# osos.functions.udf: from osos.types import IntegerType import random + random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() from osos.types import IntegerType + slen = udf(lambda s: len(s), IntegerType()) + + @udf def to_upper(s): if s is not None: return s.upper() + @udf(returnType=IntegerType()) def add_one(x): if x is not None: return x + 1 + + df = OsosSession.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show() -#osos.functions.unwrap_udt: -#osos.functions.md5: -OsosSession.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() +# osos.functions.unwrap_udt: +# osos.functions.md5: +OsosSession.createDataFrame([("ABC",)], ["a"]).select(md5("a").alias("hash")).collect() -#osos.functions.sha1: -OsosSession.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() +# osos.functions.sha1: +OsosSession.createDataFrame([("ABC",)], ["a"]).select(sha1("a").alias("hash")).collect() -#osos.functions.sha2: +# osos.functions.sha2: df = OsosSession.createDataFrame([["Alice"], ["Bob"]], ["name"]) df.withColumn("sha2", sha2(df.name, 256)).show(truncate=False) -#osos.functions.crc32: -OsosSession.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect() +# osos.functions.crc32: +OsosSession.createDataFrame([("ABC",)], ["a"]).select( + crc32("a").alias("crc32") +).collect() -#osos.functions.hash: -df = OsosSession.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) +# osos.functions.hash: +df = OsosSession.createDataFrame([("ABC", "DEF")], ["c1", "c2"]) -df.select(hash('c1').alias('hash')).show() +df.select(hash("c1").alias("hash")).show() -df.select(hash('c1', 'c2').alias('hash')).show() +df.select(hash("c1", "c2").alias("hash")).show() -#osos.functions.xxhash64: -df = OsosSession.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) +# osos.functions.xxhash64: +df = OsosSession.createDataFrame([("ABC", "DEF")], ["c1", "c2"]) -df.select(xxhash64('c1').alias('hash')).show() +df.select(xxhash64("c1").alias("hash")).show() -df.select(xxhash64('c1', 'c2').alias('hash')).show() +df.select(xxhash64("c1", "c2").alias("hash")).show() -#osos.functions.assert_true: -df = OsosSession.createDataFrame([(0,1)], ['a', 'b']) -df.select(assert_true(df.a < df.b).alias('r')).collect() -df.select(assert_true(df.a < df.b, df.a).alias('r')).collect() -df.select(assert_true(df.a < df.b, 'error').alias('r')).collect() -df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() +# osos.functions.assert_true: +df = OsosSession.createDataFrame([(0, 1)], ["a", "b"]) +df.select(assert_true(df.a < df.b).alias("r")).collect() +df.select(assert_true(df.a < df.b, df.a).alias("r")).collect() +df.select(assert_true(df.a < df.b, "error").alias("r")).collect() +df.select(assert_true(df.a > df.b, "My error msg").alias("r")).collect() -#osos.functions.raise_error: +# osos.functions.raise_error: df = OsosSession.range(1) df.select(raise_error("My error message")).show() - diff --git a/tests/scrape.py b/tests/scrape.py index 32b447d..7b9520d 100644 --- a/tests/scrape.py +++ b/tests/scrape.py @@ -7,38 +7,39 @@ htmlref = BeautifulSoup(response.text, "html.parser") -#grab all tags with class "reference internal" and get href links +# grab all tags with class "reference internal" and get href links hrefs = [a["href"] for a in htmlref.find_all("a", class_="reference internal")] testdict = {} -for href in hrefs: - url = "https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/" + href - response = requests.get(url) - func_page = BeautifulSoup(response.text, "html.parser") - pre_tags = func_page.find_all("pre") - examples = [] - for p in pre_tags: - pre_content = p.get_text() - examples.append(pre_content) - testdict[href] = examples +for href in hrefs: + url = ( + "https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/" + href + ) + response = requests.get(url) + func_page = BeautifulSoup(response.text, "html.parser") + pre_tags = func_page.find_all("pre") + examples = [] + for p in pre_tags: + pre_content = p.get_text() + examples.append(pre_content) + testdict[href] = examples key = list(testdict.keys())[0] with open("auto_test.py", "w") as file: - #loop through functions + # loop through functions for key in testdict.keys(): - #grab function name - func_name = key.split("#")[1] - #grab function examples - examples = testdict[key] - #write function examples to file - if "function" in func_name: - file.write(f"#{func_name}:\n") - for chunk in examples: - code_lines= chunk.split('\n') - for line in code_lines: - if line.startswith('>>> ') or line.startswith('... '): - line = line.strip('>>> ').strip('... ') - file.write(line + '\n') - file.write('\n') - + # grab function name + func_name = key.split("#")[1] + # grab function examples + examples = testdict[key] + # write function examples to file + if "function" in func_name: + file.write(f"#{func_name}:\n") + for chunk in examples: + code_lines = chunk.split("\n") + for line in code_lines: + if line.startswith(">>> ") or line.startswith("... "): + line = line.strip(">>> ").strip("... ") + file.write(line + "\n") + file.write("\n") diff --git a/tests/test_basic.py b/tests/test_basic.py index e37a0e1..fea4991 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -59,8 +59,7 @@ v = one.agg(F.median("baz").alias("baz")) w = one.withColumn("tup", F.upper("tup")) df = OsosSession.range(3) -x = df.select(F.when(df['id'] == 2, 3).otherwise(4).alias("age")) - +x = df.select(F.when(df["id"] == 2, 3).otherwise(4).alias("age")) ap = one._data.assign(boo=one._data["foo"] + one._data["baz"]) @@ -111,7 +110,7 @@ up = one._data.assign(**{"foosqrt": np.sqrt(one._data.foo)}) vp = pd.DataFrame(one._data.agg({"baz": np.median})).T wp = one._data.assign(tup=one._data["tup"].str.upper()) -xp = pd.DataFrame({'age':[4,4,3]}) +xp = pd.DataFrame({"age": [4, 4, 3]}) def compares_equal(osos_dataframe: DataFrame, pandas_dataframe: pd.DataFrame) -> bool: @@ -149,7 +148,7 @@ def test_functions(): assert compares_equal(s, sp) assert compares_equal(t, tp) assert compares_equal(u, up) - assert compares_equal(x,xp) + assert compares_equal(x, xp) iris_pd = pd.read_csv(