From 341c5d46a579d0e83e9e9567cda3ba891bf53065 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Jun 2024 16:39:09 -0600 Subject: [PATCH] Add changelog generator script --- .gitignore | 1 + dev/changelog/0.1.0.md | 344 ++++++++++++++++++++++++++++++ dev/release/README.md | 24 +++ dev/release/generate-changelog.py | 164 ++++++++++++++ dev/release/requirements.in | 1 + 5 files changed, 534 insertions(+) create mode 100644 dev/changelog/0.1.0.md create mode 100755 dev/release/generate-changelog.py create mode 100644 dev/release/requirements.in diff --git a/.gitignore b/.gitignore index 0818ada9b..1c247dd9a 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ rat.txt filtered_rat.txt dev/dist apache-rat-*.jar +venv diff --git a/dev/changelog/0.1.0.md b/dev/changelog/0.1.0.md new file mode 100644 index 000000000..96fedc89c --- /dev/null +++ b/dev/changelog/0.1.0.md @@ -0,0 +1,344 @@ + + +# DataFusion Comet 0.1.0 Changelog + +This release consists of 261 commits from 40 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: Add native shuffle and columnar shuffle [#30](https://github.com/apache/datafusion-comet/pull/30) (viirya) +- feat: Support Emit::First for SumDecimalGroupsAccumulator [#47](https://github.com/apache/datafusion-comet/pull/47) (viirya) +- feat: Nested map support for columnar shuffle [#51](https://github.com/apache/datafusion-comet/pull/51) (viirya) +- feat: Support Count(Distinct) and similar aggregation functions [#42](https://github.com/apache/datafusion-comet/pull/42) (huaxingao) +- feat: Upgrade to `jni-rs` 0.21 [#50](https://github.com/apache/datafusion-comet/pull/50) (sunchao) +- feat: Handle exception thrown from native side [#61](https://github.com/apache/datafusion-comet/pull/61) (sunchao) +- feat: Support InSet expression in Comet [#59](https://github.com/apache/datafusion-comet/pull/59) (viirya) +- feat: Add `CometNativeException` for exceptions thrown from the native side [#62](https://github.com/apache/datafusion-comet/pull/62) (sunchao) +- feat: Add cause to native exception [#63](https://github.com/apache/datafusion-comet/pull/63) (viirya) +- feat: Pull based native execution [#69](https://github.com/apache/datafusion-comet/pull/69) (viirya) +- feat: Add executeColumnarCollectIterator to CometExec to collect Comet operator result [#71](https://github.com/apache/datafusion-comet/pull/71) (viirya) +- feat: Add CometBroadcastExchangeExec to support broadcasting the result of Comet native operator [#80](https://github.com/apache/datafusion-comet/pull/80) (viirya) +- feat: Reduce memory consumption when writing sorted shuffle files [#82](https://github.com/apache/datafusion-comet/pull/82) (sunchao) +- feat: Add struct/map as unsupported map key/value for columnar shuffle [#84](https://github.com/apache/datafusion-comet/pull/84) (viirya) +- feat: Support multiple input sources for CometNativeExec [#87](https://github.com/apache/datafusion-comet/pull/87) (viirya) +- feat: Date and timestamp trunc with format array 
[#94](https://github.com/apache/datafusion-comet/pull/94) (parthchandra) +- feat: Support `First`/`Last` aggregate functions [#97](https://github.com/apache/datafusion-comet/pull/97) (huaxingao) +- feat: Add support of TakeOrderedAndProjectExec in Comet [#88](https://github.com/apache/datafusion-comet/pull/88) (viirya) +- feat: Support Binary in shuffle writer [#106](https://github.com/apache/datafusion-comet/pull/106) (advancedxy) +- feat: Add license header by spotless:apply automatically [#110](https://github.com/apache/datafusion-comet/pull/110) (advancedxy) +- feat: Add dictionary binary to shuffle writer [#111](https://github.com/apache/datafusion-comet/pull/111) (viirya) +- feat: Minimize number of connections used by parallel reader [#126](https://github.com/apache/datafusion-comet/pull/126) (parthchandra) +- feat: Support CollectLimit operator [#100](https://github.com/apache/datafusion-comet/pull/100) (advancedxy) +- feat: Enable min/max for boolean type [#165](https://github.com/apache/datafusion-comet/pull/165) (huaxingao) +- feat: Introduce `CometTaskMemoryManager` and native side memory pool [#83](https://github.com/apache/datafusion-comet/pull/83) (sunchao) +- feat: Fix old style names [#201](https://github.com/apache/datafusion-comet/pull/201) (comphead) +- feat: enable comet shuffle manager for comet shell [#204](https://github.com/apache/datafusion-comet/pull/204) (zuston) +- feat: Support bitwise aggregate functions [#197](https://github.com/apache/datafusion-comet/pull/197) (huaxingao) +- feat: Support BloomFilterMightContain expr [#179](https://github.com/apache/datafusion-comet/pull/179) (advancedxy) +- feat: Support sort merge join [#178](https://github.com/apache/datafusion-comet/pull/178) (viirya) +- feat: Support HashJoin operator [#194](https://github.com/apache/datafusion-comet/pull/194) (viirya) +- feat: Remove use of nightly int_roundings feature [#228](https://github.com/apache/datafusion-comet/pull/228) (psvri) +- feat: Support 
Broadcast HashJoin [#211](https://github.com/apache/datafusion-comet/pull/211) (viirya) +- feat: Enable Comet broadcast by default [#213](https://github.com/apache/datafusion-comet/pull/213) (viirya) +- feat: Add CometRowToColumnar operator [#206](https://github.com/apache/datafusion-comet/pull/206) (advancedxy) +- feat: Document the class path / classloader issue with the shuffle manager [#256](https://github.com/apache/datafusion-comet/pull/256) (holdenk) +- feat: Port Datafusion Covariance to Comet [#234](https://github.com/apache/datafusion-comet/pull/234) (huaxingao) +- feat: Add manual test to calculate spark builtin functions coverage [#263](https://github.com/apache/datafusion-comet/pull/263) (comphead) +- feat: Support ANSI mode in CAST from String to Bool [#290](https://github.com/apache/datafusion-comet/pull/290) (andygrove) +- feat: Add extended explain info to Comet plan [#255](https://github.com/apache/datafusion-comet/pull/255) (parthchandra) +- feat: Improve CometSortMergeJoin statistics [#304](https://github.com/apache/datafusion-comet/pull/304) (planga82) +- feat: Add compatibility guide [#316](https://github.com/apache/datafusion-comet/pull/316) (andygrove) +- feat: Improve CometHashJoin statistics [#309](https://github.com/apache/datafusion-comet/pull/309) (planga82) +- feat: Support Variance [#297](https://github.com/apache/datafusion-comet/pull/297) (huaxingao) +- feat: Support murmur3_hash and sha2 family hash functions [#226](https://github.com/apache/datafusion-comet/pull/226) (advancedxy) +- feat: Disable cast string to timestamp by default [#337](https://github.com/apache/datafusion-comet/pull/337) (andygrove) +- feat: Improve CometBroadcastHashJoin statistics [#339](https://github.com/apache/datafusion-comet/pull/339) (planga82) +- feat: Implement Spark-compatible CAST from string to integral types [#307](https://github.com/apache/datafusion-comet/pull/307) (andygrove) +- feat: Implement Spark-compatible CAST from string to timestamp 
types [#335](https://github.com/apache/datafusion-comet/pull/335) (vaibhawvipul) +- feat: Implement Spark-compatible CAST float/double to string [#346](https://github.com/apache/datafusion-comet/pull/346) (mattharder91) +- feat: Only allow incompatible cast expressions to run in comet if a config is enabled [#362](https://github.com/apache/datafusion-comet/pull/362) (andygrove) +- feat: Implement Spark-compatible CAST between integer types [#340](https://github.com/apache/datafusion-comet/pull/340) (ganeshkumar269) +- feat: Supports Stddev [#348](https://github.com/apache/datafusion-comet/pull/348) (huaxingao) +- feat: Improve cast compatibility tests and docs [#379](https://github.com/apache/datafusion-comet/pull/379) (andygrove) +- feat: Implement Spark-compatible CAST from non-integral numeric types to integral types [#399](https://github.com/apache/datafusion-comet/pull/399) (rohitrastogi) +- feat: Implement Spark unhex [#342](https://github.com/apache/datafusion-comet/pull/342) (tshauck) +- feat: Enable columnar shuffle by default [#250](https://github.com/apache/datafusion-comet/pull/250) (viirya) +- feat: Implement Spark-compatible CAST from floating-point/double to decimal [#384](https://github.com/apache/datafusion-comet/pull/384) (vaibhawvipul) +- feat: Add logging to explain reasons for Comet not being able to run a query stage natively [#397](https://github.com/apache/datafusion-comet/pull/397) (andygrove) +- feat: Add support for TryCast expression in Spark 3.2 and 3.3 [#416](https://github.com/apache/datafusion-comet/pull/416) (vaibhawvipul) +- feat: Supports UUID column [#395](https://github.com/apache/datafusion-comet/pull/395) (huaxingao) +- feat: correlation support [#456](https://github.com/apache/datafusion-comet/pull/456) (huaxingao) +- feat: Implement Spark-compatible CAST from String to Date [#383](https://github.com/apache/datafusion-comet/pull/383) (vidyasankarv) +- feat: Add COMET_SHUFFLE_MODE config to control Comet shuffle mode 
[#460](https://github.com/apache/datafusion-comet/pull/460) (viirya) +- feat: Add random row generator in data generator [#451](https://github.com/apache/datafusion-comet/pull/451) (advancedxy) +- feat: Add xxhash64 function support [#424](https://github.com/apache/datafusion-comet/pull/424) (advancedxy) +- feat: add hex scalar function [#449](https://github.com/apache/datafusion-comet/pull/449) (tshauck) +- feat: Add "Comet Fuzz" fuzz-testing utility [#472](https://github.com/apache/datafusion-comet/pull/472) (andygrove) +- feat: Use enum to represent CAST eval_mode in expr.proto [#415](https://github.com/apache/datafusion-comet/pull/415) (prashantksharma) +- feat: Implement ANSI support for UnaryMinus [#471](https://github.com/apache/datafusion-comet/pull/471) (vaibhawvipul) +- feat: Add specific fuzz tests for cast and try_cast and fix NPE found during fuzz testing [#514](https://github.com/apache/datafusion-comet/pull/514) (andygrove) +- feat: Add fuzz testing for arithmetic expressions [#519](https://github.com/apache/datafusion-comet/pull/519) (andygrove) +- feat: Add HashJoin support for BuildRight [#437](https://github.com/apache/datafusion-comet/pull/437) (viirya) + +**Fixed bugs:** + +- fix: Comet sink operator should not have children operators [#26](https://github.com/apache/datafusion-comet/pull/26) (viirya) +- fix: Fix the UnionExec match branches in CometExecRule [#68](https://github.com/apache/datafusion-comet/pull/68) (wankunde) +- fix: Appending null values to element array builders of StructBuilder for null row in a StructArray [#78](https://github.com/apache/datafusion-comet/pull/78) (viirya) +- fix: Fix compilation error for CometBroadcastExchangeExec [#86](https://github.com/apache/datafusion-comet/pull/86) (viirya) +- fix: Avoid exception caused by broadcasting empty result [#92](https://github.com/apache/datafusion-comet/pull/92) (wForget) +- fix: Add num_rows when building RecordBatch 
[#103](https://github.com/apache/datafusion-comet/pull/103) (advancedxy) +- fix: Cast string to boolean not compatible with Spark [#107](https://github.com/apache/datafusion-comet/pull/107) (erenavsarogullari) +- fix: Another attempt to fix libcrypto.dylib loading issue [#112](https://github.com/apache/datafusion-comet/pull/112) (advancedxy) +- fix: Fix compilation error for Spark 3.2 & 3.3 [#117](https://github.com/apache/datafusion-comet/pull/117) (sunchao) +- fix: Fix corrupted AggregateMode when transforming plan parameters [#118](https://github.com/apache/datafusion-comet/pull/118) (viirya) +- fix: bitwise shift with different left/right types [#135](https://github.com/apache/datafusion-comet/pull/135) (viirya) +- fix: Avoid null exception in removeSubquery [#147](https://github.com/apache/datafusion-comet/pull/147) (viirya) +- fix: rat check error in vscode ide [#161](https://github.com/apache/datafusion-comet/pull/161) (thexiay) +- fix: Final aggregation should not bind to the input of partial aggregation [#155](https://github.com/apache/datafusion-comet/pull/155) (viirya) +- fix: coalesce should return correct datatype [#168](https://github.com/apache/datafusion-comet/pull/168) (viirya) +- fix: attempt to divide by zero error on decimal division [#172](https://github.com/apache/datafusion-comet/pull/172) (viirya) +- fix: Aggregation without aggregation expressions should use correct result expressions [#175](https://github.com/apache/datafusion-comet/pull/175) (viirya) +- fix: Comet native operator can be executed after ReusedExchange [#187](https://github.com/apache/datafusion-comet/pull/187) (viirya) +- fix: Try to convert a static list into a set in Rust [#184](https://github.com/apache/datafusion-comet/pull/184) (advancedxy) +- fix: Include active spiller when computing peak shuffle memory [#196](https://github.com/apache/datafusion-comet/pull/196) (sunchao) +- fix: CometExecRule should handle ShuffleQueryStage and ReusedExchange 
[#186](https://github.com/apache/datafusion-comet/pull/186) (viirya) +- fix: Use `makeCopy` to change relation in `FileSourceScanExec` [#207](https://github.com/apache/datafusion-comet/pull/207) (viirya) +- fix: Remove duplicate byte array allocation for CometDictionary [#224](https://github.com/apache/datafusion-comet/pull/224) (viirya) +- fix: Remove redundant data copy in columnar shuffle [#233](https://github.com/apache/datafusion-comet/pull/233) (viirya) +- fix: Only maps FIXED_LEN_BYTE_ARRAY to String for uuid type [#238](https://github.com/apache/datafusion-comet/pull/238) (huaxingao) +- fix: Reduce RowPartition memory allocation [#244](https://github.com/apache/datafusion-comet/pull/244) (viirya) +- fix: Remove wrong calculation for Murmur3Hash for float with null input [#245](https://github.com/apache/datafusion-comet/pull/245) (advancedxy) +- fix: Deallocate row addresses and size arrays after exporting [#246](https://github.com/apache/datafusion-comet/pull/246) (viirya) +- fix: Fix wrong children expression order in IfExpr [#249](https://github.com/apache/datafusion-comet/pull/249) (viirya) +- fix: Average expression in Comet Final should handle all null inputs from partial Spark aggregation [#261](https://github.com/apache/datafusion-comet/pull/261) (viirya) +- fix: Only trigger Comet Final aggregation on Comet partial aggregation [#264](https://github.com/apache/datafusion-comet/pull/264) (viirya) +- fix: incorrect result on Comet multiple column distinct count [#268](https://github.com/apache/datafusion-comet/pull/268) (viirya) +- fix: Avoid using CometConf [#266](https://github.com/apache/datafusion-comet/pull/266) (snmvaughan) +- fix: Fix arrow error when sorting on empty batch [#271](https://github.com/apache/datafusion-comet/pull/271) (viirya) +- fix: Include license using `#` instead of using XML comment [#274](https://github.com/apache/datafusion-comet/pull/274) (snmvaughan) +- fix: Comet should not translate try_sum to native sum expression 
[#277](https://github.com/apache/datafusion-comet/pull/277) (viirya) +- fix: incorrect result with aggregate expression with filter [#284](https://github.com/apache/datafusion-comet/pull/284) (viirya) +- fix: Comet should not fail on negative limit parameter [#288](https://github.com/apache/datafusion-comet/pull/288) (viirya) +- fix: Comet columnar shuffle should not be on top of another Comet shuffle operator [#296](https://github.com/apache/datafusion-comet/pull/296) (viirya) +- fix: Iceberg scan transition should be in front of other data source v2 [#302](https://github.com/apache/datafusion-comet/pull/302) (viirya) +- fix: CometExec's outputPartitioning might not be same as Spark expects after AQE interferes [#299](https://github.com/apache/datafusion-comet/pull/299) (viirya) +- fix: CometShuffleExchangeExec logical link should be correct [#324](https://github.com/apache/datafusion-comet/pull/324) (viirya) +- fix: SortMergeJoin with unsupported key type should fall back to Spark [#355](https://github.com/apache/datafusion-comet/pull/355) (viirya) +- fix: limit with offset should return correct results [#359](https://github.com/apache/datafusion-comet/pull/359) (viirya) +- fix: Disable Comet shuffle with AQE coalesce partitions enabled [#380](https://github.com/apache/datafusion-comet/pull/380) (viirya) +- fix: Unknown operator id when explain with formatted mode [#410](https://github.com/apache/datafusion-comet/pull/410) (leoluan2009) +- fix: Reuse CometBroadcastExchangeExec with Spark ReuseExchangeAndSubquery rule [#441](https://github.com/apache/datafusion-comet/pull/441) (viirya) +- fix: newFileScanRDD should not take constructor from custom Spark versions [#412](https://github.com/apache/datafusion-comet/pull/412) (ceppelli) +- fix: fix CometNativeExec.doCanonicalize for ReusedExchangeExec [#447](https://github.com/apache/datafusion-comet/pull/447) (viirya) +- fix: Enable cast string to int tests and fix compatibility issue 
[#453](https://github.com/apache/datafusion-comet/pull/453) (andygrove) +- fix: Compute murmur3 hash with dictionary input correctly [#433](https://github.com/apache/datafusion-comet/pull/433) (advancedxy) +- fix: Only delegate to DataFusion cast when we know that it is compatible with Spark [#461](https://github.com/apache/datafusion-comet/pull/461) (andygrove) +- fix: `ColumnReader.loadVector` should initiate `CometDictionary` after re-import arrays [#473](https://github.com/apache/datafusion-comet/pull/473) (viirya) +- fix: substring with negative indices should produce correct result [#470](https://github.com/apache/datafusion-comet/pull/470) (sonhmai) +- fix: CometReader.loadVector should not overwrite dictionary ids [#476](https://github.com/apache/datafusion-comet/pull/476) (viirya) +- fix: Reuse previous CometDictionary Java arrays [#489](https://github.com/apache/datafusion-comet/pull/489) (viirya) +- fix: Fallback to Spark for LIKE with custom escape character [#478](https://github.com/apache/datafusion-comet/pull/478) (sujithjay) +- fix: Incorrect input schema when preparing result expressions for HashAggregation [#501](https://github.com/apache/datafusion-comet/pull/501) (viirya) +- fix: Input batch to ShuffleRepartitioner.insert_batch should not be larger than configured batch size [#523](https://github.com/apache/datafusion-comet/pull/523) (viirya) +- fix: Fix integer overflow in date_parser [#529](https://github.com/apache/datafusion-comet/pull/529) (eejbyfeldt) + +**Documentation updates:** + +- docs: Move existing documentation into new Contributor Guide and add Getting Started section [#334](https://github.com/apache/datafusion-comet/pull/334) (andygrove) +- docs: Add more content to the user guide [#347](https://github.com/apache/datafusion-comet/pull/347) (andygrove) +- docs: Generate configuration guide in mvn build [#349](https://github.com/apache/datafusion-comet/pull/349) (andygrove) +- docs: Add a plugin overview page to the contributors 
guide [#345](https://github.com/apache/datafusion-comet/pull/345) (andygrove) +- docs: fix the docs url of installation instructions [#393](https://github.com/apache/datafusion-comet/pull/393) (haoxins) +- docs: Running ScalaTest suites from the CLI [#404](https://github.com/apache/datafusion-comet/pull/404) (edmondop) +- docs: Remove spark.comet.exec.broadcast.enabled from config docs [#421](https://github.com/apache/datafusion-comet/pull/421) (andygrove) +- docs: fix various sphinx warnings [#428](https://github.com/apache/datafusion-comet/pull/428) (tshauck) +- docs: Update Spark shell command to include setting additional class path [#435](https://github.com/apache/datafusion-comet/pull/435) (andygrove) +- docs: Add benchmarking guide [#444](https://github.com/apache/datafusion-comet/pull/444) (andygrove) +- docs: add guide to adding a new expression [#422](https://github.com/apache/datafusion-comet/pull/422) (tshauck) +- docs: changes in documentation [#512](https://github.com/apache/datafusion-comet/pull/512) (SemyonSinchenko) +- docs: Improve user documentation for supported operators and expressions [#520](https://github.com/apache/datafusion-comet/pull/520) (andygrove) + +**Other:** + +- Initial PR [#1](https://github.com/apache/datafusion-comet/pull/1) (sunchao) +- build: Add Maven wrapper to the project [#13](https://github.com/apache/datafusion-comet/pull/13) (sunchao) +- build: Add basic CI test pipelines [#18](https://github.com/apache/datafusion-comet/pull/18) (sunchao) +- Bump com.google.protobuf:protobuf-java from 3.17.3 to 3.19.6 [#5](https://github.com/apache/datafusion-comet/pull/5) (dependabot[bot]) +- build: Add PR template [#23](https://github.com/apache/datafusion-comet/pull/23) (sunchao) +- build: Create ticket templates [#24](https://github.com/apache/datafusion-comet/pull/24) (comphead) +- build: Re-enable Scala style checker and spotless [#21](https://github.com/apache/datafusion-comet/pull/21) (sunchao) +- build: Remove license header 
from pull request template [#28](https://github.com/apache/datafusion-comet/pull/28) (viirya) +- build: Exclude .github from apache-rat-plugin check [#32](https://github.com/apache/datafusion-comet/pull/32) (viirya) +- build: Add CI for MacOS (x64 and aarch64) [#35](https://github.com/apache/datafusion-comet/pull/35) (sunchao) +- fix broken link in README.md [#39](https://github.com/apache/datafusion-comet/pull/39) (nairbv) +- test: Add some fuzz testing for cast operations [#16](https://github.com/apache/datafusion-comet/pull/16) (andygrove) +- test: Fix CI failure on libcrypto [#41](https://github.com/apache/datafusion-comet/pull/41) (sunchao) +- test: Reduce test time spent in `CometShuffleSuite` [#40](https://github.com/apache/datafusion-comet/pull/40) (sunchao) +- test: Add test for RoundRobinPartitioning [#54](https://github.com/apache/datafusion-comet/pull/54) (viirya) +- build: Fix potential libcrypto lib loading issue for X86 mac runners [#55](https://github.com/apache/datafusion-comet/pull/55) (advancedxy) +- refactor: Remove a few duplicated occurrences [#53](https://github.com/apache/datafusion-comet/pull/53) (sunchao) +- build: Fix mvn cache for containerized runners [#48](https://github.com/apache/datafusion-comet/pull/48) (advancedxy) +- test: Ensure traversed operators during finding first partial aggregaion are all native [#58](https://github.com/apache/datafusion-comet/pull/58) (viirya) +- build: Upgrade arrow-rs to 50.0.0 and DataFusion to 35.0.0 [#65](https://github.com/apache/datafusion-comet/pull/65) (viirya) +- build: Support built with java 1.8 [#45](https://github.com/apache/datafusion-comet/pull/45) (advancedxy) +- test: Add golden files for TPCDSPlanStabilitySuite [#73](https://github.com/apache/datafusion-comet/pull/73) (sunchao) +- test: Add TPC-DS test results [#77](https://github.com/apache/datafusion-comet/pull/77) (sunchao) +- build: Upgrade spotless version to 2.43.0 [#85](https://github.com/apache/datafusion-comet/pull/85) 
(viirya) +- test: Expose thrown exception when executing query in CometTPCHQuerySuite [#96](https://github.com/apache/datafusion-comet/pull/96) (viirya) +- test: Enable TPCDS q41 in CometTPCDSQuerySuite [#98](https://github.com/apache/datafusion-comet/pull/98) (viirya) +- build: Add CI for TPCDS queries [#99](https://github.com/apache/datafusion-comet/pull/99) (viirya) +- build: Add tpcds-sf-1 to license header excluded list [#108](https://github.com/apache/datafusion-comet/pull/108) (viirya) +- build: Show time duration for scala test [#116](https://github.com/apache/datafusion-comet/pull/116) (advancedxy) +- test: Move MacOS (x86) pipelines to post-commit [#122](https://github.com/apache/datafusion-comet/pull/122) (sunchao) +- doc: Add Quickstart Comet doc section [#125](https://github.com/apache/datafusion-comet/pull/125) (comphead) +- build: Upgrade DF to 36.0.0 and arrow-rs 50.0.0 [#66](https://github.com/apache/datafusion-comet/pull/66) (comphead) +- doc: Minor fix Getting started reformatting [#128](https://github.com/apache/datafusion-comet/pull/128) (comphead) +- test: Reduce end-to-end test time [#109](https://github.com/apache/datafusion-comet/pull/109) (sunchao) +- build: Separate and speedup TPC-DS benchmark [#130](https://github.com/apache/datafusion-comet/pull/130) (advancedxy) +- build: Re-enable TPCDS queries q34 and q64 in `CometTPCDSQuerySuite` [#133](https://github.com/apache/datafusion-comet/pull/133) (viirya) +- build: Refine names in benchmark.yml [#132](https://github.com/apache/datafusion-comet/pull/132) (advancedxy) +- build: Make the build system work out of box [#136](https://github.com/apache/datafusion-comet/pull/136) (advancedxy) +- minor: Update README.md with system diagram [#148](https://github.com/apache/datafusion-comet/pull/148) (alamb) +- test: Add golden files for test [#150](https://github.com/apache/datafusion-comet/pull/150) (snmvaughan) +- build: Add checker for PR title 
[#151](https://github.com/apache/datafusion-comet/pull/151) (sunchao) +- build: Support CI pipelines for Spark 3.2, 3.3 and 3.4 [#153](https://github.com/apache/datafusion-comet/pull/153) (advancedxy) +- minor: Only trigger PR title checker on pull requests [#154](https://github.com/apache/datafusion-comet/pull/154) (sunchao) +- chore: Fix warnings in both compiler and test environments [#164](https://github.com/apache/datafusion-comet/pull/164) (advancedxy) +- build: Upload test reports and coverage [#163](https://github.com/apache/datafusion-comet/pull/163) (advancedxy) +- minor: Remove unnecessary logic [#169](https://github.com/apache/datafusion-comet/pull/169) (sunchao) +- doc: Add initial doc how to expand Comet exceptions [#170](https://github.com/apache/datafusion-comet/pull/170) (comphead) +- minor: Make `QueryPlanSerde` warning log less confusing [#181](https://github.com/apache/datafusion-comet/pull/181) (viirya) +- refactor: Skipping slicing on shuffle arrays in shuffle reader [#189](https://github.com/apache/datafusion-comet/pull/189) (viirya) +- build: Run Spark SQL tests for 3.4 [#166](https://github.com/apache/datafusion-comet/pull/166) (sunchao) +- build: Enforce scalafix check in CI [#203](https://github.com/apache/datafusion-comet/pull/203) (advancedxy) +- doc: Update README.md with shuffle configs [#208](https://github.com/apache/datafusion-comet/pull/208) (viirya) +- test: Follow up on Spark 3.4 diff [#209](https://github.com/apache/datafusion-comet/pull/209) (sunchao) +- build: Avoid confusion by using profile with clean [#215](https://github.com/apache/datafusion-comet/pull/215) (snmvaughan) +- test: Add TPC-H test results [#218](https://github.com/apache/datafusion-comet/pull/218) (viirya) +- build: Add CI for TPC-H queries [#220](https://github.com/apache/datafusion-comet/pull/220) (viirya) +- test: Enable Comet shuffle in Spark SQL tests [#210](https://github.com/apache/datafusion-comet/pull/210) (sunchao) +- test: Disable spark ui in unit 
test by default [#235](https://github.com/apache/datafusion-comet/pull/235) (beryllw) +- chore: Replace deprecated temporal methods [#229](https://github.com/apache/datafusion-comet/pull/229) (snmvaughan) +- doc: Update supported expressions [#237](https://github.com/apache/datafusion-comet/pull/237) (viirya) +- build: Use specified branch of arrow-rs with workaround to invalid offset buffers from Java Arrow [#239](https://github.com/apache/datafusion-comet/pull/239) (viirya) +- test: Enable string-to-bool cast test [#251](https://github.com/apache/datafusion-comet/pull/251) (andygrove) +- test: Restore tests in CometTPCDSQuerySuite [#252](https://github.com/apache/datafusion-comet/pull/252) (viirya) +- test: Enable all remaining TPCDS queries [#254](https://github.com/apache/datafusion-comet/pull/254) (viirya) +- test: Enable all remaining TPCH queries [#257](https://github.com/apache/datafusion-comet/pull/257) (viirya) +- chore: Remove some calls to unwrap when calling create_expr in planner.rs [#269](https://github.com/apache/datafusion-comet/pull/269) (andygrove) +- doc: Fix a small typo in README.md [#272](https://github.com/apache/datafusion-comet/pull/272) (rz-vastdata) +- chore: Fix typo in info message [#279](https://github.com/apache/datafusion-comet/pull/279) (andygrove) +- chore: Fix NPE when running CometTPCHQueriesList directly [#285](https://github.com/apache/datafusion-comet/pull/285) (advancedxy) +- chore: Update Comet repo description [#291](https://github.com/apache/datafusion-comet/pull/291) (viirya) +- Chore: Cleanup how datafusion session config is created [#289](https://github.com/apache/datafusion-comet/pull/289) (psvri) +- build: Update asf.yaml to use `@datafusion.apache.org` [#294](https://github.com/apache/datafusion-comet/pull/294) (sunchao) +- doc: Update DataFusion project name and url [#300](https://github.com/apache/datafusion-comet/pull/300) (viirya) +- chore: Remove unused functions 
[#301](https://github.com/apache/datafusion-comet/pull/301) (kazuyukitanimura) +- chore: Ignore unused variables [#306](https://github.com/apache/datafusion-comet/pull/306) (snmvaughan) +- chore: Update documentation publishing domain and path [#310](https://github.com/apache/datafusion-comet/pull/310) (andygrove) +- chore: Add documentation publishing infrastructure [#314](https://github.com/apache/datafusion-comet/pull/314) (andygrove) +- build: Move shim directories [#318](https://github.com/apache/datafusion-comet/pull/318) (kazuyukitanimura) +- test: Suppress decimal random number tests for 3.2 and 3.3 [#319](https://github.com/apache/datafusion-comet/pull/319) (kazuyukitanimura) +- chore: Add allocation source to StreamReader [#332](https://github.com/apache/datafusion-comet/pull/332) (viirya) +- chore: Add more cast tests and improve test framework [#351](https://github.com/apache/datafusion-comet/pull/351) (andygrove) +- chore: Implement remaining CAST tests [#356](https://github.com/apache/datafusion-comet/pull/356) (andygrove) +- doc: Fix target typo in development.md [#364](https://github.com/apache/datafusion-comet/pull/364) (jc4x4) +- doc: Clean up supported JDKs in README [#366](https://github.com/apache/datafusion-comet/pull/366) (edmondop) +- build: Add Spark SQL test pipeline with ANSI mode enabled [#321](https://github.com/apache/datafusion-comet/pull/321) (parthchandra) +- doc: add contributing in README.md [#382](https://github.com/apache/datafusion-comet/pull/382) (caicancai) +- chore: Store EXTENSION_INFO as Set[String] instead of newline-delimited String [#386](https://github.com/apache/datafusion-comet/pull/386) (andygrove) +- build: Add scala-version to matrix [#396](https://github.com/apache/datafusion-comet/pull/396) (snmvaughan) +- chore: Add criterion benchmarks for casting between integer types [#401](https://github.com/apache/datafusion-comet/pull/401) (andygrove) +- chore: Make COMET_EXEC_BROADCAST_FORCE_ENABLED internal config 
[#413](https://github.com/apache/datafusion-comet/pull/413) (viirya) +- chore: Rename some columnar shuffle configs for code consistently [#418](https://github.com/apache/datafusion-comet/pull/418) (leoluan2009) +- chore: Remove an unused config [#430](https://github.com/apache/datafusion-comet/pull/430) (andygrove) +- doc: Add Plan Stability Testing to development guide [#432](https://github.com/apache/datafusion-comet/pull/432) (viirya) +- tests: Move random data generation methods from CometCastSuite to new DataGenerator class [#426](https://github.com/apache/datafusion-comet/pull/426) (andygrove) +- test: Fix explain with exteded info comet test [#436](https://github.com/apache/datafusion-comet/pull/436) (kazuyukitanimura) +- chore: Add cargo bench for shuffle writer [#438](https://github.com/apache/datafusion-comet/pull/438) (andygrove) +- doc: Add Tuning Guide with shuffle configs [#443](https://github.com/apache/datafusion-comet/pull/443) (viirya) +- chore: improve fallback message when comet native shuffle is not enabled [#445](https://github.com/apache/datafusion-comet/pull/445) (andygrove) +- Coverage: Add a manual test to show what Spark built in expression the DF can support directly [#331](https://github.com/apache/datafusion-comet/pull/331) (comphead) +- build: Add spark-4.0 profile and shims [#407](https://github.com/apache/datafusion-comet/pull/407) (kazuyukitanimura) +- build: bump spark version to 3.4.3 [#292](https://github.com/apache/datafusion-comet/pull/292) (huaxingao) +- chore: Removing copying data from dictionary values into CometDictionary [#490](https://github.com/apache/datafusion-comet/pull/490) (viirya) +- chore: Update README to highlight Comet benefits [#497](https://github.com/apache/datafusion-comet/pull/497) (andygrove) +- test: fix ClassNotFoundException for Hive tests [#499](https://github.com/apache/datafusion-comet/pull/499) (kazuyukitanimura) +- build: Enable comet tests with spark-4.0 profile 
[#493](https://github.com/apache/datafusion-comet/pull/493) (kazuyukitanimura) +- chore: Switch to stable Rust [#505](https://github.com/apache/datafusion-comet/pull/505) (andygrove) +- Minor: Generate the supported Spark builtin expression list into MD file [#455](https://github.com/apache/datafusion-comet/pull/455) (comphead) +- chore: Simplify code in CometExecIterator and avoid some small overhead [#522](https://github.com/apache/datafusion-comet/pull/522) (andygrove) +- chore: Upgrade spark to 4.0.0-preview1 [#526](https://github.com/apache/datafusion-comet/pull/526) (advancedxy) +- chore: Add UnboundColumn to carry datatype for unbound reference [#518](https://github.com/apache/datafusion-comet/pull/518) (viirya) +- chore: Remove 3.4.2.diff [#528](https://github.com/apache/datafusion-comet/pull/528) (kazuyukitanimura) +- build: Switch back to official DataFusion repo and arrow-rs after Arrow Java 16 is released [#403](https://github.com/apache/datafusion-comet/pull/403) (viirya) +- chore: Add CometEvalMode enum to replace string literals [#539](https://github.com/apache/datafusion-comet/pull/539) (andygrove) +- chore: Create initial release process scripts for official ASF source release [#429](https://github.com/apache/datafusion-comet/pull/429) (andygrove) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` +88 Liang-Chi Hsieh + 40 Andy Grove + 27 Chao Sun + 26 advancedxy + 11 Huaxin Gao + 8 KAZUYUKI TANIMURA + 7 Steve Vaughan + 7 comphead + 4 Parth Chandra + 4 Trent Hauck + 4 Vipul Vaibhaw + 3 Pablo Langa + 2 Edmondo Porcu + 2 Oleks V + 2 Vrishabh + 2 Xuedong Luan + 1 Andrew Lamb + 1 Brian Vaughan + 1 Cancai Cai + 1 Emil Ejbyfeldt + 1 Eren Avsarogullari + 1 Holden Karau + 1 JC + 1 Junbo wang + 1 Junfan Zhang + 1 Prashant K. 
Sharma + 1 RickestCode + 1 Rohit Rastogi + 1 Roman Zeyde + 1 Semyon + 1 Son + 1 Sujith Jay Nair + 1 Xin Hao + 1 Zhen Wang + 1 ceppelli + 1 dependabot[bot] + 1 thexia + 1 vidyasankarv + 1 wankun + 1 గణేష్ +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/release/README.md b/dev/release/README.md index b20f2d48e..2ca2f608e 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -34,6 +34,30 @@ This part of the process can be performed by any committer. - Create and merge a PR to update the version number & update the changelog - Push a release candidate tag (e.g. 0.1.0-rc1) to the Apache repository +### Generating the Change Log + +We haven't yet defined how tagging and branching will work for the source releases. This project is more complex +than DataFusion core because it consists of a Maven project and a Cargo project. However, generating a change log +to cover changes between any two commits or tags can be performed by running the provided `generate-changelog.py` +script. + +It is recommended that you set up a virtual Python environment and then install the dependencies: + +```shell +python3 -m venv venv +source venv/bin/activate +pip3 install -r requirements.in +``` + +To generate the changelog, set the `GITHUB_TOKEN` environment variable to a valid token and then run the script +providing two commit ids or tags followed by the version number of the release being created. The following +example generates a change log of all changes between the first commit and the current HEAD revision. + +```shell +export GITHUB_TOKEN= +python3 generate-changelog.py 52241f44315fd1b2fd6cd9031bb05f046fe3a5a3 HEAD 0.1.0 > ../changelog/0.1.0.md +``` + ## Publishing the Release Candidate This part of the process can mostly only be performed by a PMC member. 
diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py new file mode 100755 index 000000000..6793d7177 --- /dev/null +++ b/dev/release/generate-changelog.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Generate a Markdown changelog for DataFusion Comet between two git revisions.

Pull requests are looked up through the GitHub API (PyGithub), grouped by label
or Conventional Commits prefix, and written to stdout together with contributor
credits taken from ``git shortlog``. Progress messages go to stderr so stdout
can be redirected straight into the changelog file.
"""

import argparse
import os
import re
import subprocess
import sys

# Conventional Commits title prefix: "<type>(<scope>)!:"; scope and "!" are optional.
CC_TITLE_RE = re.compile(r'^([a-z]+)(\([a-z]+\))?(!)?:')

# (category key, section heading) in the order the sections are printed.
CATEGORIES = [
    ("breaking", "Breaking changes"),
    ("performance", "Performance related"),
    ("enhancements", "Implemented enhancements"),
    ("bugs", "Fixed bugs"),
    ("docs", "Documentation updates"),
    ("other", "Other"),
]


def author_login(pull, commit):
    """Return the GitHub login to credit for ``pull``.

    The commit author is preferred. ``commit.author`` can be None (no GitHub
    account is associated with the commit), in which case the user who opened
    the pull request is credited instead of crashing.
    """
    for candidate in (commit.author, pull.user):
        if candidate is not None:
            return candidate.login
    return "unknown"


def print_pulls(repo_name, title, pulls):
    """Print one changelog section as a Markdown bullet list.

    Args:
        repo_name: "owner/repo" slug used to build the pull request URLs.
        title: Section heading, printed in bold.
        pulls: Iterable of ``(pull, commit)`` pairs. Nothing at all is printed
            when it is empty.
    """
    if not pulls:
        return
    print(f"**{title}:**")
    print()
    for pull, commit in pulls:
        url = f"https://github.com/{repo_name}/pull/{pull.number}"
        print(f"- {pull.title} [#{pull.number}]({url}) ({author_login(pull, commit)})")
    print()


def classify_pull(title, labels):
    """Return the ``CATEGORIES`` key for a pull request.

    GitHub labels and the Conventional Commits prefix of the title are both
    consulted; the first matching rule wins, in this order: breaking, bugs,
    performance, enhancements, docs, other.

    Args:
        title: Pull request title.
        labels: Collection of label names attached to the pull request.
    """
    cc_type = ''
    cc_breaking = False
    match = CC_TITLE_RE.match(title)
    if match:
        cc_type = match.group(1)  # fix, feat, docs, chore
        cc_breaking = match.group(3) == '!'

    if 'api change' in labels or cc_breaking:
        return "breaking"
    if 'bug' in labels or cc_type == 'fix':
        return "bugs"
    if 'performance' in labels or cc_type == 'perf':
        return "performance"
    if 'enhancement' in labels or cc_type == 'feat':
        return "enhancements"
    # "doc:" is the prefix this project's PR titles actually use, so accept it
    # alongside the canonical "docs:".
    if 'documentation' in labels or cc_type in ('doc', 'docs'):
        return "docs"
    return "other"


def git_output(*args):
    """Run ``git`` with ``args`` and return its stdout as text.

    The command is passed as an argument list (no shell), so revision names are
    never interpreted by a shell.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    return subprocess.check_output(["git", *args], text=True)


def generate_changelog(repo, repo_name, tag1, tag2, version):
    """Write the changelog for the range ``tag1..tag2`` to stdout.

    Args:
        repo: PyGithub repository object used to compare the two revisions and
            to look up the pull requests behind each commit.
        repo_name: "owner/repo" slug used to build pull request URLs.
        tag1: Previous commit or tag (start of the range).
        tag2: Current commit or tag (end of the range).
        version: Version number shown in the changelog title.

    Raises:
        subprocess.CalledProcessError: if a local ``git`` command fails.
    """
    # get a list of commits between two tags
    print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr)
    comparison = repo.compare(tag1, tag2)

    # get the pull requests for these commits
    print("Fetching pull requests", file=sys.stderr)
    seen_numbers = set()
    all_pulls = []
    for commit in comparison.commits:
        for pull in commit.get_pulls():
            # there can be multiple commits per PR if squash merge is not being used and
            # in this case we should get all the author names, but for now just pick one
            if pull.number not in seen_numbers:
                seen_numbers.add(pull.number)
                all_pulls.append((pull, commit))

    # categorize the pull requests based on GitHub labels and title prefix
    print("Categorizing pull requests", file=sys.stderr)
    by_category = {key: [] for key, _ in CATEGORIES}
    for pull, commit in all_pulls:
        labels = [label.name for label in pull.labels]
        by_category[classify_pull(pull.title, labels)].append((pull, commit))

    # produce the changelog content
    print("Generating changelog content", file=sys.stderr)

    # ASF header
    # NOTE(review): only blank lines are emitted here; confirm whether the
    # license header text is meant to be part of this literal.
    print("""\n""")

    print(f"# DataFusion Comet {version} Changelog\n")

    revision_range = f"{tag1}..{tag2}"

    # get the number of commits
    commit_count = len(git_output("log", "--pretty=oneline", revision_range).splitlines())

    # One shortlog call provides both the credits table and the contributor
    # count. Only newlines are trimmed so the first row keeps its padding.
    shortlog = git_output("shortlog", "-sn", revision_range).strip("\n")
    contributor_count = len(shortlog.splitlines())

    print(f"This release consists of {commit_count} commits from {contributor_count} contributors. "
          f"See credits at the end of this changelog for more information.\n")

    for key, heading in CATEGORIES:
        print_pulls(repo_name, heading, by_category[key])

    # show code contributions
    print("## Credits\n")
    print("Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) "
          "per contributor.\n")
    print("```")
    print(shortlog)
    print("```\n")

    print("Thank you also to everyone who contributed in other ways such as filing issues, reviewing "
          "PRs, and providing feedback on this release.\n")


def parse_cli_args(args=None):
    """Parse the command line and return the argparse namespace.

    Args:
        args: Argument list to parse. When omitted or empty, ``sys.argv[1:]``
            is used.
    """
    if not args:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument("tag1", help="The previous commit or tag (e.g. 0.1.0)")
    parser.add_argument("tag2", help="The current commit or tag (e.g. HEAD)")
    parser.add_argument("version", help="The version number to include in the changelog")
    # Pass the list explicitly so a caller-supplied ``args`` is honoured.
    return parser.parse_args(args)


def cli(args=None):
    """Process command line arguments and generate the changelog."""
    options = parse_cli_args(args)

    # Imported here rather than at module level so the helpers above stay
    # importable when PyGithub is not installed.
    from github import Github

    token = os.getenv("GITHUB_TOKEN")
    project = "apache/datafusion-comet"

    g = Github(token)
    repo = g.get_repo(project)
    generate_changelog(repo, project, options.tag1, options.tag2, options.version)


if __name__ == "__main__":
    cli()