diff --git a/doc/spark_coverage.txt b/doc/spark_coverage.txt new file mode 100644 index 000000000..f7ef388e1 --- /dev/null +++ b/doc/spark_coverage.txt @@ -0,0 +1,421 @@ ++---------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|name |details | ++---------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|! |{PASSED, [{SELECT ! true;, OK}]} | +|% |{PASSED, [{SELECT 2 % 1.8;, OK}]} | +|& |{PASSED, [{SELECT 3 & 5;, OK}]} | +|* |{PASSED, [{SELECT 2 * 3;, OK}]} | +|+ |{PASSED, [{SELECT 1 + 2;, OK}]} | +|- |{PASSED, [{SELECT 2 - 1;, OK}]} | +|/ |{PASSED, [{SELECT 3 / 2;, OK}]} | +|< |{PASSED, [{SELECT 1 < 2;, OK}]} | +|<= |{PASSED, [{SELECT 2 <= 2;, OK}]} | +|<=> |{PASSED, [{SELECT 2 <=> 2;, OK}]} | +|= |{PASSED, [{SELECT 2 = 2;, OK}]} | +|== |{PASSED, [{SELECT 2 == 2;, OK}]} | +|> |{PASSED, [{SELECT 2 > 1;, OK}]} | +|>= |{PASSED, [{SELECT 2 >= 1;, OK}]} | +|^ |{PASSED, [{SELECT 3 ^ 5;, OK}]} | +|abs |{PASSED, [{SELECT abs(-1);, OK}]} | +|acos |{PASSED, [{SELECT acos(1);, OK}]} | +|acosh |{PASSED, [{SELECT acosh(1);, OK}]} | +|add_months |{PASSED, [{SELECT add_months('2016-08-31', 1);, OK}]} | +|aes_decrypt |{PASSED, [{SELECT aes_decrypt(unhex('83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94'), '0000111122223333');, OK}]} | +|aes_encrypt |{FAILED, [{SELECT hex(aes_encrypt('Spark', '0000111122223333'));, Failed on something else. 
Check query manually}]} | +|aggregate |{FAILED, [{SELECT aggregate(array(1, 2, 3), 0, (acc, x) -> acc + x);, Unsupported}]} | +|and |{PASSED, [{SELECT true and true;, OK}]} | +|any |{FAILED, [{SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col);, Unsupported}]} | +|any_value |{FAILED, [{SELECT any_value(col) FROM VALUES (10), (5), (20) AS tab(col);, Unsupported}]} | +|approx_count_distinct |{FAILED, [{SELECT approx_count_distinct(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1);, Unsupported}]} | +|approx_percentile |{FAILED, [{SELECT approx_percentile(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col);, Unsupported}]} | +|array |{FAILED, [{SELECT array(1, 2, 3);, Unsupported}]} | +|array_agg |{FAILED, [{SELECT array_agg(col) FROM VALUES (1), (2), (1) AS tab(col);, Unsupported}]} | +|array_append |{FAILED, [{SELECT array_append(array('b', 'd', 'c', 'a'), 'd');, Unsupported}]} | +|array_compact |{FAILED, [{SELECT array_compact(array(1, 2, 3, null));, Unsupported}]} | +|array_contains |{PASSED, [{SELECT array_contains(array(1, 2, 3), 2);, OK}]} | +|array_distinct |{FAILED, [{SELECT array_distinct(array(1, 2, 3, null, 3));, Unsupported}]} | +|array_except |{FAILED, [{SELECT array_except(array(1, 2, 3), array(1, 3, 5));, Unsupported}]} | +|array_insert |{FAILED, [{SELECT array_insert(array(1, 2, 3, 4), 5, 5);, Unsupported}]} | +|array_intersect |{FAILED, [{SELECT array_intersect(array(1, 2, 3), array(1, 3, 5));, Unsupported}]} | +|array_join |{PASSED, [{SELECT array_join(array('hello', 'world'), ' ');, OK}]} | +|array_max |{PASSED, [{SELECT array_max(array(1, 20, null, 3));, OK}]} | +|array_min |{PASSED, [{SELECT array_min(array(1, 20, null, 3));, OK}]} | +|array_position |{PASSED, [{SELECT array_position(array(3, 2, 1), 1);, OK}]} | +|array_remove |{FAILED, [{SELECT array_remove(array(1, 2, 3, null, 3), 3);, Unsupported}]} | +|array_repeat |{FAILED, [{SELECT array_repeat('123', 2);, Unsupported}]} | +|array_size |{PASSED, [{SELECT array_size(array('b', 'd', 'c', 'a'));, OK}]} | +|array_sort |{FAILED, [{SELECT array_sort(array(5, 6, 1), (left, right) -> case when left < right then -1 when left > right then 1 else 0 end);, Unsupported}]} | +|array_union |{FAILED, [{SELECT array_union(array(1, 2, 3), array(1, 3, 5));, Unsupported}]} | +|arrays_overlap |{PASSED, [{SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5));, OK}]} | +|arrays_zip |{FAILED, [{SELECT arrays_zip(array(1, 2, 3), array(2, 3, 4));, Unsupported}]} | +|ascii |{PASSED, [{SELECT ascii('222');, OK}]} | +|asin |{PASSED, [{SELECT asin(0);, OK}]} | +|asinh |{PASSED, [{SELECT asinh(0);, OK}]} | +|assert_true |{PASSED, [{SELECT assert_true(0 < 1);, OK}]} | +|atan |{PASSED, [{SELECT atan(0);, OK}]} | +|atan2 |{PASSED, [{SELECT atan2(0, 0);, OK}]} | +|atanh |{PASSED, [{SELECT atanh(0);, OK}]} | +|avg |{FAILED, [{SELECT avg(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|base64 |{PASSED, [{SELECT base64('Spark SQL');, OK}]} | +|bigint |{SKIPPED, []} | +|bin |{PASSED, [{SELECT bin(13);, OK}]} | +|binary |{SKIPPED, []} | +|bit_and |{FAILED, [{SELECT bit_and(col) FROM VALUES (3), (5) AS tab(col);, Unsupported}]} | +|bit_count |{PASSED, [{SELECT bit_count(0);, OK}]} | +|bit_get |{PASSED, [{SELECT bit_get(11, 0);, OK}]} | +|bit_length |{PASSED, [{SELECT bit_length('Spark SQL');, OK}]} | +|bit_or |{FAILED, [{SELECT bit_or(col) FROM VALUES (3), (5) AS tab(col);, Unsupported}]} | +|bit_xor |{FAILED, [{SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col);, Unsupported}]} | +|bool_and 
|{FAILED, [{SELECT bool_and(col) FROM VALUES (true), (true), (true) AS tab(col);, Unsupported}]} | +|bool_or |{FAILED, [{SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col);, Unsupported}]} | +|boolean |{SKIPPED, []} | +|bround |{PASSED, [{SELECT bround(2.5, 0);, OK}]} | +|btrim |{PASSED, [{SELECT btrim(' SparkSQL ');, OK}]} | +|cardinality |{PASSED, [{SELECT cardinality(array('b', 'd', 'c', 'a'));, OK}]} | +|cast |{PASSED, [{SELECT cast('10' as int);, OK}]} | +|cbrt |{PASSED, [{SELECT cbrt(27.0);, OK}]} | +|ceil |{PASSED, [{SELECT ceil(-0.1);, OK}]} | +|ceiling |{PASSED, [{SELECT ceiling(-0.1);, OK}]} | +|char |{PASSED, [{SELECT char(65);, OK}]} | +|char_length |{PASSED, [{SELECT char_length('Spark SQL ');, OK}]} | +|character_length |{PASSED, [{SELECT character_length('Spark SQL ');, OK}]} | +|chr |{PASSED, [{SELECT chr(65);, OK}]} | +|coalesce |{PASSED, [{SELECT coalesce(NULL, 1, NULL);, OK}]} | +|collect_list |{FAILED, [{SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col);, Unsupported}]} | +|collect_set |{FAILED, [{SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col);, Unsupported}]} | +|concat |{PASSED, [{SELECT concat('Spark', 'SQL');, OK}]} | +|concat_ws |{PASSED, [{SELECT concat_ws(' ', 'Spark', 'SQL');, OK}]} | +|contains |{PASSED, [{SELECT contains('Spark SQL', 'Spark');, OK}]} | +|conv |{PASSED, [{SELECT conv('100', 2, 10);, OK}]} | +|convert_timezone |{FAILED, [{SELECT convert_timezone('Europe/Brussels', 'America/Los_Angeles', timestamp_ntz'2021-12-06 00:00:00');, Failed on native side}]} | +|corr |{FAILED, [{SELECT corr(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2);, Unsupported}]} | +|cos |{PASSED, [{SELECT cos(0);, OK}]} | +|cosh |{PASSED, [{SELECT cosh(0);, OK}]} | +|cot |{PASSED, [{SELECT cot(1);, OK}]} | +|count |{FAILED, [{SELECT count(*) FROM VALUES (NULL), (5), (5), (20) AS tab(col);, Unsupported}]} | +|count_if |{FAILED, [{SELECT count_if(col % 2 = 0) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col);, Unsupported}]} | +|count_min_sketch |{FAILED, [{SELECT hex(count_min_sketch(col, 0.5d, 0.5d, 1)) FROM VALUES (1), (2), (1) AS tab(col);, Unsupported}]} | +|covar_pop |{FAILED, [{SELECT covar_pop(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);, Unsupported}]} | +|covar_samp |{FAILED, [{SELECT covar_samp(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);, Unsupported}]} | +|crc32 |{PASSED, [{SELECT crc32('Spark');, OK}]} | +|csc |{PASSED, [{SELECT csc(1);, OK}]} | +|cume_dist |{FAILED, [{SELECT a, b, cume_dist() OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|curdate |{PASSED, [{SELECT curdate();, OK}]} | +|current_catalog |{PASSED, [{SELECT current_catalog();, OK}]} | +|current_database |{PASSED, [{SELECT current_database();, OK}]} | +|current_date |{PASSED, [{SELECT current_date();, OK}]} | +|current_schema |{PASSED, [{SELECT current_schema();, OK}]} | +|current_timestamp |{FAILED, [{SELECT current_timestamp();, Failed on something else. 
Check query manually}]} | +|current_timezone |{PASSED, [{SELECT current_timezone();, OK}]} | +|current_user |{PASSED, [{SELECT current_user();, OK}]} | +|date |{SKIPPED, []} | +|date_add |{PASSED, [{SELECT date_add('2016-07-30', 1);, OK}]} | +|date_diff |{PASSED, [{SELECT date_diff('2009-07-31', '2009-07-30');, OK}]} | +|date_format |{PASSED, [{SELECT date_format('2016-04-08', 'y');, OK}]} | +|date_from_unix_date |{PASSED, [{SELECT date_from_unix_date(1);, OK}]} | +|date_part |{PASSED, [{SELECT date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456');, OK}]} | +|date_sub |{PASSED, [{SELECT date_sub('2016-07-30', 1);, OK}]} | +|date_trunc |{PASSED, [{SELECT date_trunc('YEAR', '2015-03-05T09:32:05.359');, OK}]} | +|dateadd |{PASSED, [{SELECT dateadd('2016-07-30', 1);, OK}]} | +|datediff |{PASSED, [{SELECT datediff('2009-07-31', '2009-07-30');, OK}]} | +|datepart |{PASSED, [{SELECT datepart('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456');, OK}]} | +|day |{PASSED, [{SELECT day('2009-07-30');, OK}]} | +|dayofmonth |{PASSED, [{SELECT dayofmonth('2009-07-30');, OK}]} | +|dayofweek |{PASSED, [{SELECT dayofweek('2009-07-30');, OK}]} | +|dayofyear |{PASSED, [{SELECT dayofyear('2016-04-09');, OK}]} | +|decimal |{SKIPPED, []} | +|decode |{PASSED, [{SELECT decode(encode('abc', 'utf-8'), 'utf-8');, OK}]} | +|degrees |{PASSED, [{SELECT degrees(3.141592653589793);, OK}]} | +|dense_rank |{FAILED, [{SELECT a, b, dense_rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|div |{PASSED, [{SELECT 3 div 2;, OK}]} | +|double |{SKIPPED, []} | +|e |{PASSED, [{SELECT e();, OK}]} | +|element_at |{PASSED, [{SELECT element_at(array(1, 2, 3), 2);, OK}]} | +|elt |{FAILED, [{SELECT elt(1, 'scala', 'java');, Unsupported}]} | +|encode |{PASSED, [{SELECT encode('abc', 'utf-8');, OK}]} | +|endswith |{PASSED, [{SELECT endswith('Spark SQL', 'SQL');, OK}]} | +|equal_null |{PASSED, [{SELECT equal_null(3, 3);, OK}]} | +|every |{FAILED, [{SELECT every(col) FROM VALUES (true), (true), (true) AS tab(col);, Unsupported}]} | +|exists |{FAILED, [{SELECT exists(array(1, 2, 3), x -> x % 2 == 0);, Unsupported}]} | +|exp |{PASSED, [{SELECT exp(0);, OK}]} | +|explode |{FAILED, [{SELECT explode(array(10, 20));, Unsupported}]} | +|explode_outer |{FAILED, [{SELECT explode_outer(array(10, 20));, Unsupported}]} | +|expm1 |{PASSED, [{SELECT expm1(0);, OK}]} | +|extract |{PASSED, [{SELECT extract(YEAR FROM TIMESTAMP '2019-08-12 01:00:00.123456');, OK}]} | +|factorial |{PASSED, [{SELECT factorial(5);, OK}]} | +|filter |{FAILED, [{SELECT filter(array(1, 2, 3), x -> x % 2 == 1);, Unsupported}]} | +|find_in_set |{PASSED, [{SELECT find_in_set('ab','abc,b,ab,c,def');, OK}]} | +|first |{FAILED, [{SELECT first(col) FROM VALUES (10), (5), (20) AS tab(col);, Unsupported}]} | +|first_value |{FAILED, [{SELECT first_value(col) FROM VALUES (10), (5), (20) AS tab(col);, Unsupported}]} | +|flatten |{FAILED, [{SELECT flatten(array(array(1, 2), array(3, 4)));, Unsupported}]} | +|float |{SKIPPED, []} | +|floor |{PASSED, [{SELECT floor(-0.1);, OK}]} | +|forall |{FAILED, [{SELECT forall(array(1, 2, 3), x -> x % 2 == 0);, Unsupported}]} | +|format_number |{PASSED, [{SELECT format_number(12332.123456, 4);, OK}]} | +|format_string |{PASSED, [{SELECT format_string("Hello World %d %s", 100, "days");, OK}]} | +|from_csv |{FAILED, [{SELECT from_csv('1, 0.8', 'a INT, b DOUBLE');, Unsupported}]} | +|from_json |{FAILED, [{SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE');, Unsupported}]} | 
+|from_unixtime |{PASSED, [{SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss');, OK}]} | +|from_utc_timestamp |{PASSED, [{SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul');, OK}]} | +|get |{PASSED, [{SELECT get(array(1, 2, 3), 0);, OK}]} | +|get_json_object |{PASSED, [{SELECT get_json_object('{"a":"b"}', '$.a');, OK}]} | +|getbit |{PASSED, [{SELECT getbit(11, 0);, OK}]} | +|greatest |{PASSED, [{SELECT greatest(10, 9, 2, 4, 3);, OK}]} | +|grouping |{FAILED, [{SELECT name, grouping(name), sum(age) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name);, Failed on something else. Check query manually}]} | +|grouping_id |{FAILED, [{SELECT name, grouping_id(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height);, Failed on something else. Check query manually}]} | +|hash |{PASSED, [{SELECT hash('Spark', array(123), 2);, OK}]} | +|hex |{PASSED, [{SELECT hex(17);, OK}]} | +|histogram_numeric |{FAILED, [{SELECT histogram_numeric(col, 5) FROM VALUES (0), (1), (2), (10) AS tab(col);, Unsupported}]} | +|hour |{PASSED, [{SELECT hour('2009-07-30 12:58:59');, OK}]} | +|hypot |{PASSED, [{SELECT hypot(3, 4);, OK}]} | +|if |{PASSED, [{SELECT if(1 < 2, 'a', 'b');, OK}]} | +|ifnull |{FAILED, [{SELECT ifnull(NULL, array('2'));, Unsupported}]} | +|ilike |{PASSED, [{SELECT ilike('Spark', '_Park');, OK}]} | +|in |{PASSED, [{SELECT 1 in(1, 2, 3);, OK}]} | +|initcap |{PASSED, [{SELECT initcap('sPark sql');, OK}]} | +|inline |{FAILED, [{SELECT inline(array(struct(1, 'a'), struct(2, 'b')));, Unsupported}]} | +|inline_outer |{FAILED, [{SELECT inline_outer(array(struct(1, 'a'), struct(2, 'b')));, Unsupported}]} | +|input_file_block_length |{FAILED, [{SELECT input_file_block_length();, Unsupported}]} | +|input_file_block_start |{FAILED, [{SELECT input_file_block_start();, Unsupported}]} | +|input_file_name |{FAILED, [{SELECT input_file_name();, Unsupported}]} | +|instr |{PASSED, [{SELECT instr('SparkSQL', 'SQL');, OK}]} | +|int |{SKIPPED, []} | +|isnan |{PASSED, [{SELECT isnan(cast('NaN' as double));, OK}]} | +|isnotnull |{PASSED, [{SELECT isnotnull(1);, OK}]} | +|isnull |{PASSED, [{SELECT isnull(1);, OK}]} | +|java_method |{FAILED, [{SELECT java_method('java.util.UUID', 'randomUUID');, Unsupported}]} | +|json_array_length |{PASSED, [{SELECT json_array_length('[1,2,3,4]');, OK}]} | +|json_object_keys |{FAILED, [{SELECT json_object_keys('{}');, Unsupported}]} | +|json_tuple |{FAILED, [{SELECT json_tuple('{"a":1, "b":2}', 'a', 'b');, Unsupported}]} | +|kurtosis |{FAILED, [{SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);, Unsupported}]} | +|lag |{FAILED, [{SELECT a, b, lag(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|last |{FAILED, [{SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col);, Unsupported}]} | +|last_day |{PASSED, [{SELECT last_day('2009-01-12');, OK}]} | +|last_value |{FAILED, [{SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col);, Unsupported}]} | +|lcase |{PASSED, [{SELECT lcase('SparkSql');, OK}]} | +|lead |{FAILED, [{SELECT a, b, lead(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|least |{PASSED, [{SELECT least(10, 9, 2, 4, 3);, OK}]} | +|left |{PASSED, [{SELECT left('Spark SQL', 3);, OK}]} | +|len |{PASSED, [{SELECT len('Spark SQL ');, OK}]} | +|length |{PASSED, [{SELECT length('Spark SQL ');, OK}]} | +|levenshtein |{PASSED, 
[{SELECT levenshtein('kitten', 'sitting');, OK}]} | +|like |{PASSED, [{SELECT like('Spark', '_park');, OK}]} | +|ln |{PASSED, [{SELECT ln(1);, OK}]} | +|localtimestamp |{FAILED, [{SELECT localtimestamp();, Failed on native side}]} | +|locate |{PASSED, [{SELECT locate('bar', 'foobarbar');, OK}]} | +|log |{PASSED, [{SELECT log(10, 100);, OK}]} | +|log10 |{PASSED, [{SELECT log10(10);, OK}]} | +|log1p |{PASSED, [{SELECT log1p(0);, OK}]} | +|log2 |{PASSED, [{SELECT log2(2);, OK}]} | +|lower |{PASSED, [{SELECT lower('SparkSql');, OK}]} | +|lpad |{PASSED, [{SELECT lpad('hi', 5, '??');, OK}]} | +|ltrim |{PASSED, [{SELECT ltrim(' SparkSQL ');, OK}]} | +|make_date |{PASSED, [{SELECT make_date(2013, 7, 15);, OK}]} | +|make_dt_interval |{FAILED, [{SELECT make_dt_interval(1, 12, 30, 01.001001);, Unsupported}]} | +|make_interval |{FAILED, [{SELECT make_interval(100, 11, 1, 1, 12, 30, 01.001001);, Unsupported}]} | +|make_timestamp |{PASSED, [{SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887);, OK}]} | +|make_timestamp_ltz |{PASSED, [{SELECT make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887);, OK}]} | +|make_timestamp_ntz |{FAILED, [{SELECT make_timestamp_ntz(2014, 12, 28, 6, 30, 45.887);, Failed on native side}]} | +|make_ym_interval |{FAILED, [{SELECT make_ym_interval(1, 2);, Unsupported}]} | +|map |{FAILED, [{SELECT map(1.0, '2', 3.0, '4');, Unsupported}]} | +|map_concat |{FAILED, [{SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c'));, Unsupported}]} | +|map_contains_key |{PASSED, [{SELECT map_contains_key(map(1, 'a', 2, 'b'), 1);, OK}]} | +|map_entries |{FAILED, [{SELECT map_entries(map(1, 'a', 2, 'b'));, Unsupported}]} | +|map_filter |{FAILED, [{SELECT map_filter(map(1, 0, 2, 2, 3, -1), (k, v) -> k > v);, Unsupported}]} | +|map_from_arrays |{FAILED, [{SELECT map_from_arrays(array(1.0, 3.0), array('2', '4'));, Unsupported}]} | +|map_from_entries |{FAILED, [{SELECT map_from_entries(array(struct(1, 'a'), struct(2, 'b')));, Unsupported}]} | +|map_keys |{FAILED, [{SELECT map_keys(map(1, 'a', 2, 'b'));, Unsupported}]} | +|map_values |{FAILED, [{SELECT map_values(map(1, 'a', 2, 'b'));, Unsupported}]} | +|map_zip_with |{FAILED, [{SELECT map_zip_with(map(1, 'a', 2, 'b'), map(1, 'x', 2, 'y'), (k, v1, v2) -> concat(v1, v2));, Unsupported}]} | +|mask |{PASSED, [{SELECT mask('abcd-EFGH-8765-4321');, OK}]} | +|max |{FAILED, [{SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col);, Unsupported}]} | +|max_by |{FAILED, [{SELECT max_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y);, Unsupported}]} | +|md5 |{PASSED, [{SELECT md5('Spark');, OK}]} | +|mean |{FAILED, [{SELECT mean(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|median |{FAILED, [{SELECT median(col) FROM VALUES (0), (10) AS tab(col);, Unsupported}]} | +|min |{FAILED, [{SELECT min(col) FROM VALUES (10), (-1), (20) AS tab(col);, Unsupported}]} | +|min_by |{FAILED, [{SELECT min_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y);, Unsupported}]} | +|minute |{PASSED, [{SELECT minute('2009-07-30 12:58:59');, OK}]} | +|mod |{PASSED, [{SELECT 2 % 1.8;, OK}]} | +|mode |{FAILED, [{SELECT mode(col) FROM VALUES (0), (10), (10) AS tab(col);, Unsupported}]} | +|monotonically_increasing_id|{FAILED, [{SELECT monotonically_increasing_id();, Unsupported}]} | +|month |{PASSED, [{SELECT month('2016-07-30');, OK}]} | +|months_between |{PASSED, [{SELECT months_between('1997-02-28 10:30:00', '1996-10-30');, OK}]} | +|named_struct |{FAILED, [{SELECT named_struct("a", 1, "b", 2, "c", 3);, Unsupported}]} | +|nanvl 
|{PASSED, [{SELECT nanvl(cast('NaN' as double), 123);, OK}]} | +|negative |{PASSED, [{SELECT negative(1);, OK}]} | +|next_day |{PASSED, [{SELECT next_day('2015-01-14', 'TU');, OK}]} | +|not |{PASSED, [{SELECT not true;, OK}]} | +|now |{FAILED, [{SELECT now();, Failed on something else. Check query manually}]} | +|nth_value |{FAILED, [{SELECT a, b, nth_value(b, 2) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|ntile |{FAILED, [{SELECT a, b, ntile(2) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|nullif |{PASSED, [{SELECT nullif(2, 2);, OK}]} | +|nvl |{FAILED, [{SELECT nvl(NULL, array('2'));, Unsupported}]} | +|nvl2 |{PASSED, [{SELECT nvl2(NULL, 2, 1);, OK}]} | +|octet_length |{PASSED, [{SELECT octet_length('Spark SQL');, OK}]} | +|or |{PASSED, [{SELECT true or false;, OK}]} | +|overlay |{PASSED, [{SELECT overlay('Spark SQL' PLACING '_' FROM 6);, OK}]} | +|parse_url |{FAILED, [{SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST');, Unsupported}]} | +|percent_rank |{FAILED, [{SELECT a, b, percent_rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|percentile |{FAILED, [{SELECT percentile(col, 0.3) FROM VALUES (0), (10) AS tab(col);, Unsupported}]} | +|percentile_approx |{FAILED, [{SELECT percentile_approx(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col);, Unsupported}]} | +|pi |{PASSED, [{SELECT pi();, OK}]} | +|pmod |{PASSED, [{SELECT pmod(10, 3);, OK}]} | +|posexplode |{FAILED, [{SELECT posexplode(array(10,20));, Unsupported}]} | +|posexplode_outer |{FAILED, [{SELECT posexplode_outer(array(10,20));, Unsupported}]} | +|position |{PASSED, [{SELECT position('bar', 'foobarbar');, OK}]} | +|positive |{PASSED, [{SELECT positive(1);, OK}]} | +|pow |{PASSED, [{SELECT pow(2, 3);, OK}]} | +|power |{PASSED, [{SELECT power(2, 3);, OK}]} | +|printf |{PASSED, [{SELECT printf("Hello World %d %s", 100, "days");, OK}]} | +|quarter |{PASSED, [{SELECT quarter('2016-08-31');, OK}]} | +|radians |{PASSED, [{SELECT radians(180);, OK}]} | +|raise_error |{FAILED, [{SELECT raise_error('custom error message');, Unsupported}]} | +|rand |{FAILED, [{SELECT rand();, Unsupported}]} | +|randn |{FAILED, [{SELECT randn();, Unsupported}]} | +|random |{FAILED, [{SELECT random();, Unsupported}]} | +|rank |{FAILED, [{SELECT a, b, rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|reduce |{FAILED, [{SELECT reduce(array(1, 2, 3), 0, (acc, x) -> acc + x);, Unsupported}]} | +|reflect |{FAILED, [{SELECT reflect('java.util.UUID', 'randomUUID');, Unsupported}]} | +|regexp |{FAILED, [{SELECT regexp('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*');, Failed on something else. Check query manually}]} | +|regexp_count |{PASSED, [{SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en');, OK}]} | +|regexp_extract |{PASSED, [{SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);, OK}]} | +|regexp_extract_all |{FAILED, [{SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1);, Unsupported}]} | +|regexp_instr |{PASSED, [{SELECT regexp_instr('user@spark.apache.org', '@[^.]*');, OK}]} | +|regexp_like |{FAILED, [{SELECT regexp_like('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*');, Failed on something else. 
Check query manually}]} | +|regexp_replace |{PASSED, [{SELECT regexp_replace('100-200', '(\\d+)', 'num');, OK}]} | +|regexp_substr |{PASSED, [{SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en');, OK}]} | +|regr_avgx |{FAILED, [{SELECT regr_avgx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_avgy |{FAILED, [{SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_count |{FAILED, [{SELECT regr_count(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_intercept |{FAILED, [{SELECT regr_intercept(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);, Unsupported}]} | +|regr_r2 |{FAILED, [{SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_slope |{FAILED, [{SELECT regr_slope(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);, Unsupported}]} | +|regr_sxx |{FAILED, [{SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_sxy |{FAILED, [{SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|regr_syy |{FAILED, [{SELECT regr_syy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);, Unsupported}]} | +|repeat |{PASSED, [{SELECT repeat('123', 2);, OK}]} | +|replace |{PASSED, [{SELECT replace('ABCabc', 'abc', 'DEF');, OK}]} | +|reverse |{PASSED, [{SELECT reverse('Spark SQL');, OK}]} | +|right |{PASSED, [{SELECT right('Spark SQL', 3);, OK}]} | +|rint |{PASSED, [{SELECT rint(12.3456);, OK}]} | +|rlike |{FAILED, [{SELECT rlike('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*');, Failed on something else. Check query manually}]} | +|round |{PASSED, [{SELECT round(2.5, 0);, OK}]} | +|row_number |{FAILED, [{SELECT a, b, row_number() OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);, Unsupported}]} | +|rpad |{PASSED, [{SELECT rpad('hi', 5, '??');, OK}]} | +|rtrim |{PASSED, [{SELECT rtrim(' SparkSQL ');, OK}]} | +|schema_of_csv |{PASSED, [{SELECT schema_of_csv('1,abc');, OK}]} | +|schema_of_json |{PASSED, [{SELECT schema_of_json('[{"col":0}]');, OK}]} | +|sec |{PASSED, [{SELECT sec(0);, OK}]} | +|second |{PASSED, [{SELECT second('2009-07-30 12:58:59');, OK}]} | +|sentences |{FAILED, [{SELECT sentences('Hi there! Good morning.');, Unsupported}]} | +|sequence |{FAILED, [{SELECT sequence(1, 5);, Unsupported}]} | +|session_window |{FAILED, [{SELECT a, session_window.start, session_window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:10:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, session_window(b, '5 minutes') ORDER BY a, start;, Failed on something else. 
Check query manually}]} | +|sha |{PASSED, [{SELECT sha('Spark');, OK}]} | +|sha1 |{PASSED, [{SELECT sha1('Spark');, OK}]} | +|sha2 |{PASSED, [{SELECT sha2('Spark', 256);, OK}]} | +|shiftleft |{PASSED, [{SELECT shiftleft(2, 1);, OK}]} | +|shiftright |{PASSED, [{SELECT shiftright(4, 1);, OK}]} | +|shiftrightunsigned |{PASSED, [{SELECT shiftrightunsigned(4, 1);, OK}]} | +|shuffle |{FAILED, [{SELECT shuffle(array(1, 20, 3, 5));, Unsupported}]} | +|sign |{PASSED, [{SELECT sign(40);, OK}]} | +|signum |{PASSED, [{SELECT signum(40);, OK}]} | +|sin |{PASSED, [{SELECT sin(0);, OK}]} | +|sinh |{PASSED, [{SELECT sinh(0);, OK}]} | +|size |{PASSED, [{SELECT size(array('b', 'd', 'c', 'a'));, OK}]} | +|skewness |{FAILED, [{SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);, Unsupported}]} | +|slice |{FAILED, [{SELECT slice(array(1, 2, 3, 4), 2, 2);, Unsupported}]} | +|smallint |{SKIPPED, []} | +|some |{FAILED, [{SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col);, Unsupported}]} | +|sort_array |{FAILED, [{SELECT sort_array(array('b', 'd', null, 'c', 'a'), true);, Unsupported}]} | +|soundex |{PASSED, [{SELECT soundex('Miller');, OK}]} | +|space |{PASSED, [{SELECT concat(space(2), '1');, OK}]} | +|spark_partition_id |{FAILED, [{SELECT spark_partition_id();, Unsupported}]} | +|split |{FAILED, [{SELECT split('oneAtwoBthreeC', '[ABC]');, Unsupported}]} | +|split_part |{PASSED, [{SELECT split_part('11.12.13', '.', 3);, OK}]} | +|sqrt |{PASSED, [{SELECT sqrt(4);, OK}]} | +|stack |{FAILED, [{SELECT stack(2, 1, 2, 3);, Unsupported}]} | +|startswith |{PASSED, [{SELECT startswith('Spark SQL', 'Spark');, OK}]} | +|std |{FAILED, [{SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|stddev |{FAILED, [{SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|stddev_pop |{FAILED, [{SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|stddev_samp |{FAILED, [{SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|str_to_map |{FAILED, [{SELECT str_to_map('a:1,b:2,c:3', ',', ':');, Unsupported}]} | +|string |{SKIPPED, []} | +|struct |{FAILED, [{SELECT struct(1, 2, 3);, Unsupported}]} | +|substr |{PASSED, [{SELECT substr('Spark SQL', 5);, OK}]} | +|substring |{PASSED, [{SELECT substring('Spark SQL', 5);, OK}]} | +|substring_index |{PASSED, [{SELECT substring_index('www.apache.org', '.', 2);, OK}]} | +|sum |{FAILED, [{SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col);, Unsupported}]} | +|tan |{PASSED, [{SELECT tan(0);, OK}]} | +|tanh |{PASSED, [{SELECT tanh(0);, OK}]} | +|timestamp |{SKIPPED, []} | +|timestamp_micros |{PASSED, [{SELECT timestamp_micros(1230219000123123);, OK}]} | +|timestamp_millis |{PASSED, [{SELECT timestamp_millis(1230219000123);, OK}]} | +|timestamp_seconds |{PASSED, [{SELECT timestamp_seconds(1230219000);, OK}]} | +|tinyint |{SKIPPED, []} | +|to_binary |{PASSED, [{SELECT to_binary('abc', 'utf-8');, OK}]} | +|to_char |{PASSED, [{SELECT to_char(454, '999');, OK}]} | +|to_csv |{PASSED, [{SELECT to_csv(named_struct('a', 1, 'b', 2));, OK}]} | +|to_date |{PASSED, [{SELECT to_date('2009-07-30 04:17:52');, OK}]} | +|to_json |{PASSED, [{SELECT to_json(named_struct('a', 1, 'b', 2));, OK}]} | +|to_number |{PASSED, [{SELECT to_number('454', '999');, OK}]} | +|to_timestamp |{PASSED, [{SELECT to_timestamp('2016-12-31 00:12:00');, OK}]} | +|to_timestamp_ltz |{PASSED, [{SELECT to_timestamp_ltz('2016-12-31 00:12:00');, OK}]} | +|to_timestamp_ntz |{FAILED, [{SELECT 
to_timestamp_ntz('2016-12-31 00:12:00');, Failed on native side}]} | +|to_unix_timestamp |{PASSED, [{SELECT to_unix_timestamp('2016-04-08', 'yyyy-MM-dd');, OK}]} | +|to_utc_timestamp |{PASSED, [{SELECT to_utc_timestamp('2016-08-31', 'Asia/Seoul');, OK}]} | +|transform |{FAILED, [{SELECT transform(array(1, 2, 3), x -> x + 1);, Unsupported}]} | +|transform_keys |{FAILED, [{SELECT transform_keys(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + 1);, Unsupported}]} | +|transform_values |{FAILED, [{SELECT transform_values(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> v + 1);, Unsupported}]} | +|translate |{PASSED, [{SELECT translate('AaBbCc', 'abc', '123');, OK}]} | +|trim |{PASSED, [{SELECT trim(' SparkSQL ');, OK}]} | +|trunc |{PASSED, [{SELECT trunc('2019-08-04', 'week');, OK}]} | +|try_add |{PASSED, [{SELECT try_add(1, 2);, OK}]} | +|try_avg |{FAILED, [{SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|try_divide |{PASSED, [{SELECT try_divide(3, 2);, OK}]} | +|try_element_at |{PASSED, [{SELECT try_element_at(array(1, 2, 3), 2);, OK}]} | +|try_multiply |{PASSED, [{SELECT try_multiply(2, 3);, OK}]} | +|try_subtract |{PASSED, [{SELECT try_subtract(2, 1);, OK}]} | +|try_sum |{FAILED, [{SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col);, Unsupported}]} | +|try_to_binary |{PASSED, [{SELECT try_to_binary('abc', 'utf-8');, OK}]} | +|try_to_number |{PASSED, [{SELECT try_to_number('454', '999');, OK}]} | +|try_to_timestamp |{PASSED, [{SELECT try_to_timestamp('2016-12-31 00:12:00');, OK}]} | +|typeof |{PASSED, [{SELECT typeof(1);, OK}]} | +|ucase |{PASSED, [{SELECT ucase('SparkSql');, OK}]} | +|unbase64 |{PASSED, [{SELECT unbase64('U3BhcmsgU1FM');, OK}]} | +|unhex |{PASSED, [{SELECT decode(unhex('537061726B2053514C'), 'UTF-8');, OK}]} | +|unix_date |{PASSED, [{SELECT unix_date(DATE("1970-01-02"));, OK}]} | +|unix_micros |{PASSED, [{SELECT unix_micros(TIMESTAMP('1970-01-01 00:00:01Z'));, OK}]} | +|unix_millis |{PASSED, [{SELECT unix_millis(TIMESTAMP('1970-01-01 00:00:01Z'));, OK}]} | +|unix_seconds |{PASSED, [{SELECT unix_seconds(TIMESTAMP('1970-01-01 00:00:01Z'));, OK}]} | +|unix_timestamp |{PASSED, [{SELECT unix_timestamp();, OK}]} | +|upper |{PASSED, [{SELECT upper('SparkSql');, OK}]} | +|url_decode |{PASSED, [{SELECT url_decode('https%3A%2F%2Fspark.apache.org');, OK}]} | +|url_encode |{PASSED, [{SELECT url_encode('https://spark.apache.org');, OK}]} | +|user |{PASSED, [{SELECT user();, OK}]} | +|uuid |{FAILED, [{SELECT uuid();, Unsupported}]} | +|var_pop |{FAILED, [{SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|var_samp |{FAILED, [{SELECT var_samp(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|variance |{FAILED, [{SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col);, Unsupported}]} | +|version |{PASSED, [{SELECT version();, OK}]} | +|weekday |{PASSED, [{SELECT weekday('2009-07-30');, OK}]} | +|weekofyear |{PASSED, [{SELECT weekofyear('2008-02-20');, OK}]} | +|when |{PASSED, [{SELECT CASE WHEN 1 > 0 THEN 1 WHEN 2 > 0 THEN 2.0 ELSE 1.2 END;, OK}]} | +|width_bucket |{PASSED, [{SELECT width_bucket(5.3, 0.2, 10.6, 5);, OK}]} | +|window_time |{FAILED, [{SELECT a, window.start as start, window.end as end, window_time(window), cnt FROM (SELECT a, window, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, window.start);, Failed on 
something else. Check query manually}]}| +|xpath |{FAILED, [{SELECT xpath('b1b2b3c1c2','a/b/text()');, Unsupported}]} | +|xpath_boolean |{PASSED, [{SELECT xpath_boolean('1','a/b');, OK}]} | +|xpath_double |{PASSED, [{SELECT xpath_double('12', 'sum(a/b)');, OK}]} | +|xpath_float |{PASSED, [{SELECT xpath_float('12', 'sum(a/b)');, OK}]} | +|xpath_int |{PASSED, [{SELECT xpath_int('12', 'sum(a/b)');, OK}]} | +|xpath_long |{PASSED, [{SELECT xpath_long('12', 'sum(a/b)');, OK}]} | +|xpath_number |{PASSED, [{SELECT xpath_number('12', 'sum(a/b)');, OK}]} | +|xpath_short |{PASSED, [{SELECT xpath_short('12', 'sum(a/b)');, OK}]} | +|xpath_string |{PASSED, [{SELECT xpath_string('bcc','a/c');, OK}]} | +|xxhash64 |{PASSED, [{SELECT xxhash64('Spark', array(123), 2);, OK}]} | +|year |{PASSED, [{SELECT year('2016-07-30');, OK}]} | +|zip_with |{FAILED, [{SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x));, Unsupported}]} | +|| |{PASSED, [{SELECT 3 | 5;, OK}]} | +|~ |{PASSED, [{SELECT ~ 0;, OK}]} | ++---------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/doc/spark_coverage_agg.txt b/doc/spark_coverage_agg.txt new file mode 100644 index 000000000..5c5da67ad --- /dev/null +++ b/doc/spark_coverage_agg.txt @@ -0,0 +1,9 @@ ++-------+----------------------------------------------+---+ +|result |reason |cnt| ++-------+----------------------------------------------+---+ +|FAILED |Unsupported |137| +|FAILED |Failed on native side |4 | +|PASSED |OK |254| +|SKIPPED|null |12 | +|FAILED |Failed on something else. Check query manually|10 | ++-------+----------------------------------------------+---+ diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionCoverageSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionCoverageSuite.scala new file mode 100644 index 000000000..5b20f15ec --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/CometExpressionCoverageSuite.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.comet
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
+
+import scala.collection.mutable
+
+import org.scalatest.Ignore
+import org.scalatest.exceptions.TestFailedException
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+
+/**
+ * Manual test that measures how much of Spark's built-in function surface is covered by Comet,
+ * by running the example query registered for each built-in function.
+ *
+ * Running the test rewrites doc/spark_coverage.txt and doc/spark_coverage_agg.txt.
+ */
+@Ignore
+class CometExpressionCoverageSuite extends CometTestBase with AdaptiveSparkPlanHelper {
+
+  import testImplicits._
+
+  test("Test Spark builtin functions coverage") {
+    val queryPattern = """(?i)SELECT (.+?);""".r
+    val valuesPattern = """(?i)FROM VALUES(.+?);""".r
+    val selectPattern = """(?i)SELECT(.+?)FROM""".r
+    val builtinExamplesMap = spark.sessionState.functionRegistry
+      .listFunction()
+      .map(spark.sessionState.catalog.lookupFunctionInfo(_))
+      .filter(_.getSource.toLowerCase == "built-in")
+      .filter(f =>
+        // exclude exotic functions such as `window`; they are exercised manually
+        !List("window").contains(f.getName.toLowerCase))
+      .map(f => {
+        val selectRows = queryPattern.findAllMatchIn(f.getExamples).map(_.group(0)).toList
+        (f.getName, selectRows.filter(_.nonEmpty))
+      })
+      .toMap
+
+    // key: function name
+    // value: list of results showing whether Comet supports the function
+    val resultsMap = new mutable.HashMap[String, CoverageResult]()
+
+    builtinExamplesMap.foreach {
+      case (funcName, q :: _) =>
+        val queryResult =
+          try {
+            // Example with inline data,
+            // e.g. SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col).
+            // A more robust option would be to parse the query and iterate through its
+            // expressions, but this is an ad hoc coverage test
+            if (q.toLowerCase.contains(" from values")) {
+              val select = selectPattern.findFirstMatchIn(q).map(_.group(0))
+              val values = valuesPattern.findFirstMatchIn(q).map(_.group(0))
+              (select, values) match {
+                case (Some(s), Some(v)) =>
+                  testSingleLineQuery(s"select * $v", s"$s tbl")
+
+                case _ => sys.error(s"Query $q cannot be parsed properly")
+              }
+            } else {
+              // Plain example like SELECT cos(0);
+              testSingleLineQuery("select 'dummy' x", s"${q.dropRight(1)}, x from tbl")
+            }
+            CoverageResult("PASSED", Seq((q, "OK")))
+          } catch {
+            case e: TestFailedException
+                if e.message.getOrElse("").contains("Expected only Comet native operators") =>
+              CoverageResult("FAILED", Seq((q, "Unsupported")))
+            case e if e.getMessage.contains("CometNativeException") =>
+              CoverageResult("FAILED", Seq((q, "Failed on native side")))
+            case _ =>
+              CoverageResult("FAILED", Seq((q, "Failed on something else. Check query manually")))
+          }
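+        // How a query ends up classified (these buckets are what doc/spark_coverage_agg.txt counts):
+        //   PASSED / OK                    - the plan ran fully on Comet native operators
+        //   FAILED / Unsupported           - checkSparkAnswerAndOperator saw non-native operators
+        //   FAILED / Failed on native side - the native engine raised a CometNativeException
+        //   FAILED / Check query manually  - any other error; the query needs a human look
+        //   SKIPPED                        - the function's catalog entry has no SELECT example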
+        resultsMap.put(funcName, queryResult)
+      case (funcName, List()) =>
+        resultsMap.put(funcName, CoverageResult("SKIPPED", Seq.empty))
+    }
+
+    // TODO: convert resultsMap into HTML for easier browsing
+    resultsMap.toSeq.toDF("name", "details").createOrReplaceTempView("t")
+    val str_agg = showString(
+      spark.sql(
+        "select result, d._2 as reason, count(1) cnt from (select name, t.details.result, explode_outer(t.details.details) as d from t) group by 1, 2"),
+      500,
+      0)
+    Files.write(Paths.get("doc/spark_coverage_agg.txt"), str_agg.getBytes(StandardCharsets.UTF_8))
+
+    val str = showString(spark.sql("select * from t order by 1"), 500, 0)
+    Files.write(Paths.get("doc/spark_coverage.txt"), str.getBytes(StandardCharsets.UTF_8))
+  }
+}
+
+case class CoverageResult(result: String, details: Seq[(String, String)])
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 803f30bed..bbe7edd3c 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -19,15 +19,13 @@
 
 package org.apache.comet
 
-import java.util
-
 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.functions.expr
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.SESSION_LOCAL_TIMEZONE
-import org.apache.spark.sql.types.{Decimal, DecimalType, StructType}
+import org.apache.spark.sql.types.{Decimal, DecimalType}
 
 import org.apache.comet.CometSparkSessionExtensions.{isSpark32, isSpark33Plus, isSpark34Plus}
 
@@ -1291,30 +1289,6 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  // tests one liner query without necessity to create external table
-  def testSingleLineQuery(
-      prepareQuery: String,
-      testQuery: String,
-      testName: String = "test",
-      tableName: String = "tbl"): Unit = {
-
-    withTempDir { dir =>
-      val path = new Path(dir.toURI.toString, testName).toUri.toString
-      var data: java.util.List[Row] = new util.ArrayList()
-      var schema: StructType = null
-
-      withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
-        val df = spark.sql(prepareQuery)
-        data = df.collectAsList()
-        schema = df.schema
-      }
-
-      spark.createDataFrame(data, schema).repartition(1).write.parquet(path)
-      readParquetFile(path, Some(schema)) { df => df.createOrReplaceTempView(tableName) }
-      checkSparkAnswerAndOperator(testQuery)
-    }
-  }
-
   test("Decimal random number tests") {
     val rand = scala.util.Random
     def makeNum(p: Int, s: Int): String = {
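The helper below is moved from CometExpressionSuite into CometTestBase so the coverage suite above can reuse it. As a sketch of how the coverage test drives it (the expansion is illustrative, not literal code from the suite; values taken from the bit_xor example):

    // For "SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col);" the suite
    // splits the example on FROM VALUES and effectively calls:
    testSingleLineQuery(
      "select * FROM VALUES (3), (5) AS tab(col);", // prepareQuery: collected with Comet off, written to Parquet
      "SELECT bit_xor(col) FROM tbl") // testQuery: must match Spark's answer using only Comet operators

Plain examples such as SELECT cos(0); are instead rewritten to run against a one-row dummy table.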
diff --git a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
index 6fb81bc43..ff5cb6ec6 100644
--- a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
@@ -718,4 +718,36 @@ abstract class CometTestBase
       Seq.empty
     }
   }
+
+  // Runs a one-line query end to end without creating an external table: the result of
+  // `prepareQuery` is written to Parquet, registered as `tableName`, and `testQuery` is then
+  // checked against both Spark and Comet.
+  def testSingleLineQuery(
+      prepareQuery: String,
+      testQuery: String,
+      testName: String = "test",
+      tableName: String = "tbl"): Unit = {
+
+    withTempDir { dir =>
+      val path = new Path(dir.toURI.toString, testName).toUri.toString
+      var data: java.util.List[Row] = new java.util.ArrayList()
+      var schema: StructType = null
+
+      // Run the preparation query with Comet disabled so the input data comes from vanilla Spark
+      withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
+        val df = spark.sql(prepareQuery)
+        data = df.collectAsList()
+        schema = df.schema
+      }
+
+      spark.createDataFrame(data, schema).repartition(1).write.parquet(path)
+      readParquetFile(path, Some(schema)) { df => df.createOrReplaceTempView(tableName) }
+      checkSparkAnswerAndOperator(testQuery)
+    }
+  }
+
+  // Exposes Dataset.showString (package-private to org.apache.spark.sql, which is why this
+  // lives here) so tests can render query results as the text tables in doc/spark_coverage*.txt
+  def showString[T](
+      df: Dataset[T],
+      _numRows: Int,
+      truncate: Int = 20,
+      vertical: Boolean = false): String = {
+    df.showString(_numRows, truncate, vertical)
+  }
 }
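For reference, a minimal sketch of using the two new CometTestBase helpers together, assuming a suite that extends CometTestBase with the imports used in the coverage suite above (the test name and output path are hypothetical):

    // Verify one expression end to end, then render a result the same way the
    // coverage reports are generated (numRows = 500, truncate = 0 for full-width cells).
    test("single-line query sketch") {
      testSingleLineQuery("select 'dummy' x", "SELECT cos(0), x from tbl")
      val report = showString(spark.sql("select 1 as id"), 500, 0)
      Files.write(Paths.get("/tmp/report.txt"), report.getBytes(StandardCharsets.UTF_8))
    }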