diff --git a/.github/workflows/build-python-wheels.yml b/.github/workflows/build-python-wheels.yml index be723bf1..4961ac8c 100644 --- a/.github/workflows/build-python-wheels.yml +++ b/.github/workflows/build-python-wheels.yml @@ -69,7 +69,7 @@ jobs: strategy: matrix: os: [windows-latest, macOS-latest] - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/python-upload-test.yml b/.github/workflows/python-upload-test.yml index c87c58cf..4c8af786 100644 --- a/.github/workflows/python-upload-test.yml +++ b/.github/workflows/python-upload-test.yml @@ -19,10 +19,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install --upgrade setuptools setuptools-rust build + python -m pip install --upgrade setuptools setuptools-rust build packaging - - name: Make .devXX version - run: python ./python/latest_dev_version.py + - name: Modify version for TestPyPI upload + run: python ./python/modify_version_for_testpypi.py - name: Build sdist working-directory: ./python @@ -52,8 +52,18 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - - name: Make .devXX version - run: python ./python/latest_dev_version.py + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install -U pip + python -m pip install -U packaging + + - name: Modify version for TestPyPI upload + run: python ./python/modify_version_for_testpypi.py - uses: eiennohito/gha-manylinux-build@master with: @@ -70,7 +80,7 @@ jobs: strategy: matrix: os: [windows-latest, macOS-latest] - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] steps: - uses: actions/checkout@v4 @@ -89,17 +99,17 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - - 
name: Make .devXX version - run: python ./python/latest_dev_version.py - - - name: Add aarch64 target for Rust - run: rustup target add aarch64-apple-darwin - if: startsWith(matrix.os, 'macOS') - - name: Install dependencies run: | python -m pip install -U pip - python -m pip install -U setuptools setuptools_rust build + python -m pip install -U setuptools setuptools_rust build packaging + + - name: Modify version for TestPyPI upload + run: python ./python/modify_version_for_testpypi.py + + - name: Add aarch64/x86 target for Rust + run: rustup target add aarch64-apple-darwin x86_64-apple-darwin + if: startsWith(matrix.os, 'macOS') - name: Build wheel working-directory: ./python @@ -139,7 +149,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest, windows-latest, macOS-latest ] - python-version: [ "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] fail-fast: false runs-on: ${{ matrix.os }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 17e90916..8f669b1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,148 +1,179 @@ -# [0.6.8](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.8) (2023-12-14) +# Changelog -## Highlights +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). -* Produce builds for Python 3.12 (#236) -* Add a simple [configuration API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#config-config) -* Add surface projections (#230) +Also check [python changelog](python/CHANGELOG.md). -## Surface Projections +## [Unreleased] -* For chiTra compatibility SudachiPy can now directly produce different tokens in the surface field. -* Original surface is accessible via [`Morheme.raw_surface()`](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Morpheme.raw_surface) method -* It is possible to customize projection dictionary-wise, via Config object, passing it on a dictionary creation, or for a single pre-tokenizer. 
- * [Config API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.config.Config.projection) - * [Pretokenizer API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Dictionary.pre_tokenizer) +## [0.6.9](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.9) (2024-11-20) -# [0.6.7](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.7) (2023-02-16) +### Added -## Highlights +- freebsd support (#222 by @KonstantinDjairo, #251) +- Add rust minimum support version (#255) +- Add option for embedded config and fallback resources (#262 by @Kuuuube) -* Provide binary wheels for Python 3.11 -* Add `Dictionary.lookup()` method which allows you to enumerate morphemes from the dictionary without performing analysis. +### Changed -# [0.6.6](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.6) (2022-07-25) +- `fetch_dictionary.sh` targets latest dictionary by default (#240) +- update dependencies (#241, #246) +- Migrate from structopt to clap (#248 by @tkhshtsh0917) -## Highlights -* Add [boundary matching mode](https://github.com/WorksApplications/Sudachi/blob/develop/docs/oov_handlers.md) to regex oov handler -* macOS binary builds are now unversal2 (arm+x64) +## [0.6.8](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.8) (2023-12-14) -## MacOS -* Binary builds are universal2 -* Caveat: we don't run tests on arm because there are no public arm instances, so builds may be broken without any warning +### Highlights -# [0.6.5](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.5) (2022-06-21) +- Produce builds for Python 3.12 (#236) +- Add a simple [configuration API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#config-config) +- Add surface projections (#230) -## Highlights +### Surface Projections -* Fixed invalid POS tags which appeared when using user-defined POS tags both in user dictionaries and OOV 
handlers. +- For chiTra compatibility SudachiPy can now directly produce different tokens in the surface field. +- Original surface is accessible via [`Morpheme.raw_surface()`](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Morpheme.raw_surface) method +- It is possible to customize projection dictionary-wise, via Config object, passing it on a dictionary creation, or for a single pre-tokenizer. + - [Config API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.config.Config.projection) + - [Pretokenizer API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Dictionary.pre_tokenizer) + +## [0.6.7](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.7) (2023-02-16) + +### Highlights + +- Provide binary wheels for Python 3.11 +- Add `Dictionary.lookup()` method which allows you to enumerate morphemes from the dictionary without performing analysis. + +## [0.6.6](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.6) (2022-07-25) + +### Highlights + +- Add [boundary matching mode](https://github.com/WorksApplications/Sudachi/blob/develop/docs/oov_handlers.md) to regex oov handler +- macOS binary builds are now universal2 (arm+x64) + +### MacOS + +- Binary builds are universal2 +- Caveat: we don't run tests on arm because there are no public arm instances, so builds may be broken without any warning + +## [0.6.5](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.5) (2022-06-21) + +### Highlights + +- Fixed invalid POS tags which appeared when using user-defined POS tags both in user dictionaries and OOV handlers. You are not affected by this bug if you did not use user-defined POS in OOV handlers. 
-# [0.6.4](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.3) (2022-06-16) +## [0.6.4](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.4) (2022-06-16) -## Highlights +### Highlights -* Remove Python 3.6 support which reached end-of-life status on [2021-12-23](https://endoflife.date/python) -* OOV handler plugins support user-defined POS, [similar to Java version](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) -* Added Regex OOV handler +- Remove Python 3.6 support which reached end-of-life status on [2021-12-23](https://endoflife.date/python) +- OOV handler plugins support user-defined POS, [similar to Java version](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) +- Added Regex OOV handler -## Regex OOV Handler +### Regex OOV Handler -* For details, see [Java version changelog](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) -* In Rust/Python Regexes do not support backtracking and backreferences -* `maxLength` setting defines maximum length in unicode codepoints, not in utf-8 bytes as in Java (will be changed to codepoints later) +- For details, see [Java version changelog](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) +- In Rust/Python Regexes do not support backtracking and backreferences +- `maxLength` setting defines maximum length in unicode codepoints, not in utf-8 bytes as in Java (will be changed to codepoints later) -# [0.6.3](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.3) (2022-02-10) +## [0.6.3](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.3) (2022-02-10) -## Highlights +### Highlights -* Fixed path resolution algorithm for resources. They are now resolved in the following order (first existing file wins): +- Fixed path resolution algorithm for resources. They are now resolved in the following order (first existing file wins): 1. Absolute paths stay as they are 2. 
Relative to "path" value of the config file 3. Relative to "resource_dir" parameter of the config object during creation - * For SudachiPy it is the parameter of `Dictionary` constructor + - For SudachiPy it is the parameter of `Dictionary` constructor 4. Relative to the location of the configuration file 5. Relative to the current directory -## Python +### Python + +- `Dictionary` now has `__repr__()` function which displays absolute paths to dictionaries in use. +- `Dictionary` now has `pos_of()` function which returns a POS tuple for a given POS id. +- `PosMatcher` supports set operations + - union (`m1 | m2`) + - intersection (`m1 & m2`) + - difference (`m1 - m2`) + - negation (`~m1`) + +## [0.6.2](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.2) (2021-12-09) + +### Fixes + +- Fix analysis differences with 0.5.4 + +## [0.6.1](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.1) (2021-12-08) + +### Highlights + +- Added Fuzzing (see `sudachi-fuzz` subdirectory), Sudachi.rs seems to be pretty robust towards arbitrary inputs (no crashes and panics) + - Issues like https://github.com/WorksApplications/sudachi.rs/issues/182 should never occur more +- ~5% analysis speed improvement over 0.6.0 +- Added support for Unicode combining symbols, now Sudachi.rs/py should be much better with emoji (🎅🏾) and more complex Unicode (İstanbul) -* `Dictionary` now has `__repr__()` function which displays absolute paths to dictionaries in use. -* `Dictionary` now has `pos_of()` function which returns a POS tuple for a given POS id. 
-* `PosMatcher` supports set operations - * union (`m1 | m2`) - * intersection (`m1 & m2`) - * difference (`m1 - m2`) - * negation (`~m1`) +### Rust -# [0.6.2](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.2) (2021-12-09) +- Added partial dictionary read functionality, it is now possible to skip reading certain fields if they are not needed +- Improved startup times, especially for debug builds -## Fixes +### Python -* Fix analysis differences with 0.5.4 +- See [Python changelog](./python/CHANGELOG.md) -# [0.6.1](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.1) (2021-12-08) +## [0.6.0](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.0) (2021-11-11) -## Highlights -* Added Fuzzing (see `sudachi-fuzz` subdirectory), Sudachi.rs seems to be pretty robust towards arbitrary inputs (no crashes and panics) - * Issues like https://github.com/WorksApplications/sudachi.rs/issues/182 should never occur more -* ~5% analysis speed improvement over 0.6.0 -* Added support for Unicode combining symbols, now Sudachi.rs/py should be much better with emoji (🎅🏾) and more complex Unicode (İstanbul) +### Highlights -## Rust -* Added partial dictionary read functionality, it is now possible to skip reading certain fields if they are not needed -* Improved startup times, especially for debug builds +- Full feature parity with Java version +- ~15% analysis speed improvement over 0.6.0-rc1 -## Python -* See [Python changelog](./python/CHANGELOG.md) +### Rust -# [0.6.0](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.0) (2021-11-11) -## Highlights -* Full feature parity with Java version -* ~15% analysis speed improvement over 0.6.0-rc1 +- Added dictionary build functionality + - https://github.com/WorksApplications/sudachi.rs/pull/143 +- Added an option to perform analysis without sentence splitting + - Use it with `--split-sentences=no` -## Rust -* Added dictionary build functionality - * 
https://github.com/WorksApplications/sudachi.rs/pull/143 -* Added an option to perform analysis without sentence splitting - * Use it with `--split-sentences=no` +### Python -## Python -* Added bindings for dictionary build (undocumented and not supported as API). - * See https://github.com/WorksApplications/sudachi.rs/issues/157 -* `sudachipy build` and `sudachipy ubuild` should work once more - * Report on build times and dictionary part sizes can differ from the original SudachiPy +- Added bindings for dictionary build (undocumented and not supported as API). + - See https://github.com/WorksApplications/sudachi.rs/issues/157 +- `sudachipy build` and `sudachipy ubuild` should work once more + - Report on build times and dictionary part sizes can differ from the original SudachiPy +## [0.6.0-rc1](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.0-rc1) (2021-10-26) -# [0.6.0-rc1](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.0-rc1) (2021-10-26) -## Highlights +### Highlights -* First release of Sudachi.rs -* SudachiPy compatible Python bindings -* ~30x speed improvement over original SudachiPy -* Dictionary build mode will be done before 0.6.0 final (See #13) +- First release of Sudachi.rs +- SudachiPy compatible Python bindings +- ~30x speed improvement over original SudachiPy +- Dictionary build mode will be done before 0.6.0 final (See #13) -## Rust +### Rust -* Analysis: feature parity with Python and Java version -* Dictionary build is not supported in rc1 -* ~2x faster than Java version (with sentence splitting) -* No public API at the moment (contact us if you want to use Rust version directly, internals will significantly change and names are not finalized) +- Analysis: feature parity with Python and Java version +- Dictionary build is not supported in rc1 +- ~2x faster than Java version (with sentence splitting) +- No public API at the moment (contact us if you want to use Rust version directly, internals will 
significantly change and names are not finalized) -## Python +### Python -* Mostly compatible with SudachiPy 0.5.4 -* We provide binary wheels for popular platforms -* ~30x faster than 0.5.4 -* IgnoreYomigana input text plugin is now supported (and enabled by default) -* We provide [binary wheels for convenience (and additional speed on Linux)](https://worksapplications.github.io/sudachi.rs/python/wheels.html) +- Mostly compatible with SudachiPy 0.5.4 +- We provide binary wheels for popular platforms +- ~30x faster than 0.5.4 +- IgnoreYomigana input text plugin is now supported (and enabled by default) +- We provide [binary wheels for convenience (and additional speed on Linux)](https://worksapplications.github.io/sudachi.rs/python/wheels.html) -## Known Issues +### Known Issues -* List of deprecated SudachiPy API: - * `MorphemeList.empty(dict: Dictionary)` - * This also needs a dictionary as an argument. - * `Morpheme.split(mode: SplitMode)` - * `Morpheme.get_word_info()` - * Most of instance attributes are not exported: e.g. `Dictionary.grammar`, `Dictionary.lexicon`. - * See [API reference page](https://worksapplications.github.io/sudachi.rs/python/) for supported APIs. -* Dictionary Build is not supported: `sudachipy build` and `sudachipy ubuild` will not work, please use 0.5.3 in another virtual environment for the time being until the feature is implemented: #13 \ No newline at end of file +- List of deprecated SudachiPy API: + - `MorphemeList.empty(dict: Dictionary)` + - This also needs a dictionary as an argument. + - `Morpheme.split(mode: SplitMode)` + - `Morpheme.get_word_info()` + - Most of instance attributes are not exported: e.g. `Dictionary.grammar`, `Dictionary.lexicon`. + - See [API reference page](https://worksapplications.github.io/sudachi.rs/python/) for supported APIs. 
+- Dictionary Build is not supported: `sudachipy build` and `sudachipy ubuild` will not work, please use 0.5.3 in another virtual environment for the time being until the feature is implemented: #13 diff --git a/Cargo.lock b/Cargo.lock index de284ecd..31a5ee30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -289,7 +289,7 @@ dependencies = [ [[package]] name = "default_input_text" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "sudachi", ] @@ -432,14 +432,14 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "join_katakana_oov" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "sudachi", ] [[package]] name = "join_numeric" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "sudachi", ] @@ -794,7 +794,7 @@ dependencies = [ [[package]] name = "simple_oov" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "sudachi", ] @@ -807,7 +807,7 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "sudachi" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "aho-corasick", "bitflags", @@ -835,7 +835,7 @@ dependencies = [ [[package]] name = "sudachi-cli" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "cfg-if", "clap", @@ -855,7 +855,7 @@ dependencies = [ [[package]] name = "sudachipy" -version = "0.6.9-a1" +version = "0.6.9" dependencies = [ "pyo3", "scopeguard", diff --git a/Cargo.toml b/Cargo.toml index ef063507..530bed1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ members = [ default-members = ["sudachi", "sudachi-cli"] [workspace.package] -version = "0.6.9-a1" +version = "0.6.10-a1" authors = ["Works Applications "] edition = "2021" rust-version = "1.74.1" diff --git a/README.ja.md b/README.ja.md index 21fd80c1..295b6a1e 100644 --- a/README.ja.md +++ b/README.ja.md @@ -1,18 +1,21 @@ -# sudachi.rs - 日本語README +# sudachi.rs - 日本語 README

sudachi.rs logo

-sudachi.rs は日本語形態素解析器 [Sudachi](https://github.com/WorksApplications/Sudachi) のRust実装です。 +sudachi.rs は日本語形態素解析器 [Sudachi](https://github.com/WorksApplications/Sudachi) の Rust 実装です。 -[English README](README.md) [SudachiPy Documentation](https://worksapplications.github.io/sudachi.rs/python) +[English README](README.md), [SudachiPy Documentation](./python/README.md) ## TL;DR -SudachiPyとして使うには +Python 版のインストール: + ```bash -$ pip install --upgrade 'sudachipy>=0.6.8' +pip install --upgrade 'sudachipy>=0.6.9' ``` +Rust 版のインストール: + ```bash $ git clone https://github.com/WorksApplications/sudachi.rs.git $ cd ./sudachi.rs @@ -30,7 +33,7 @@ EOS 複数粒度での分割 -``` +```sh $ echo 選挙管理委員会 | sudachi 選挙管理委員会 名詞,固有名詞,一般,*,*,* 選挙管理委員会 EOS @@ -45,7 +48,7 @@ EOS 正規化表記 -``` +```sh $ echo 打込む かつ丼 附属 vintage | sudachi 打込む 動詞,一般,*,*,五段-マ行,終止形-一般 打ち込む 空白,*,*,*,*,* @@ -59,7 +62,7 @@ EOS 分かち書き出力 -``` +```sh $ cat lemon.txt えたいの知れない不吉な塊が私の心を始終圧えつけていた。 焦躁と言おうか、嫌悪と言おうか――酒を飲んだあとに宿酔があるように、酒を毎日飲んでいると宿酔に相当した時期がやって来る。 @@ -71,28 +74,31 @@ $ sudachi --wakati lemon.txt それ が 来 た の だ 。 これ は ちょっと いけ なかっ た 。 ``` - ## セットアップ -sudachi.rs本体に加え、デフォルトで使用するプラグイン、また辞書が必要になります。※パッケージには辞書が含まれていません。 +sudachi.rs 本体に加え、デフォルトで使用するプラグイン、また辞書が必要になります。※パッケージには辞書が含まれていません。 ### 1. ソースコードの取得 -``` -$ git clone https://github.com/WorksApplications/sudachi.rs.git +```sh +git clone https://github.com/WorksApplications/sudachi.rs.git ``` -### 2. Sudachi辞書のダウンロード +### 2. 
Sudachi 辞書のダウンロード -[WorksApplications/SudachiDict](https://github.com/WorksApplications/SudachiDict)から辞書のzipファイル( `small` 、 `core` 、 `full` から一つ選択)し、解凍して、必要であれば中にある `system_*.dic` ファイルをわかりやすい位置に置いてください。 +[WorksApplications/SudachiDict](https://github.com/WorksApplications/SudachiDict)から辞書の zip ファイル( `small` 、 `core` 、 `full` から一つ選択)し、解凍して、必要であれば中にある `system_*.dic` ファイルをわかりやすい位置に置いてください。 デフォルトの設定ファイルでは、辞書ファイルが `resources/system.dic` に存在していると指定しています(ファイル名が `system.dic` に変わっていることに注意)。 #### ダウンロードスクリプト 上記のように手動で設置する以外に、レポジトリにあるスクリプトを使って自動的に辞書をダウンロードし `resources/system.dic` として設置することもできます。 -``` -$ ./fetch_dictionary.sh +```sh +# fetch latest core dictionary +./fetch_dictionary.sh + +# fetch dictionary of specified version and type +./fetch_dictionary.sh 20241021 small ``` ### 3. ビルド @@ -101,8 +107,8 @@ $ ./fetch_dictionary.sh `--all` フラグを使って付属のプラグインもまとめてビルドすることができます。 -``` -$ cargo build --release +```sh +cargo build --release ``` #### ビルド(辞書バイナリの埋め込み) @@ -116,7 +122,8 @@ $ cargo build --release ビルド時、埋め込む辞書へのパスを `SUDACHI_DICT_PATH` 環境変数によって指定する必要があります。 このパスは絶対パスもしくは sudachi.rs ディレクトリからの相対パスで指定してください。 -Unix-likeシステムでの例: +Unix-like システムでの例: + ```sh # resources/system.dic への辞書ダウンロード $ ./fetch_dictionary.sh @@ -130,10 +137,11 @@ $ env SUDACHI_DICT_PATH=resources/system.dic cargo build --release --features ba $ env SUDACHI_DICT_PATH=/path/to/my-sudachi.dic cargo build --release --features bake_dictionary ``` - ### 4. インストール -``` -sudachi.rs/ $ cargo install --path sudachi-cli/ + +```sh +$ cd sudachi.rs/ +$ cargo install --path sudachi-cli/ $ which sudachi /Users//.cargo/bin/sudachi @@ -144,7 +152,6 @@ A Japanese tokenizer ... 
``` - ## 利用方法 ```bash @@ -204,12 +211,12 @@ Options: - 辞書形 - 読み -- 辞書ID - - 0 システム辞書 - - 1 ユーザー辞書 - - -1 未知語(辞書に含まれない単語) -- 同義語グループID -- "OOV" 未知語(辞書に含まれない単語)の場合のみ +- 辞書 ID + - `0` システム辞書 + - `1` ユーザー辞書 + - `-1` 未知語(辞書に含まれない単語) +- 同義語グループ ID +- `(OOV)` 未知語(辞書に含まれない単語)の場合のみ ```bash $ echo "外国人参政権" | sudachi -a @@ -231,14 +238,12 @@ $ echo "外国人参政権" | sudachi -m A -w 外国 人 参政 権 ``` - ## ToDo - [x] 未知語処理 -- [ ] 簡単な辞書ファイルのインストール、管理([SudachiPyでの方式を参考に](https://github.com/WorksApplications/SudachiPy/issues/73)) +- [ ] 簡単な辞書ファイルのインストール、管理([SudachiPy での方式を参考に](https://github.com/WorksApplications/SudachiPy/issues/73)) - [ ] crates.io への登録 - ## リファレンス ### Sudachi @@ -248,12 +253,12 @@ $ echo "外国人参政権" | sudachi -m A -w - [WorksApplications/SudachiPy](https://github.com/WorksApplications/SudachiPy) - [msnoigrs/gosudachi](https://github.com/msnoigrs/gosudachi) -### Rustによる形態素解析器の実装 +### Rust による形態素解析器の実装 - [agatan/yoin: A Japanese Morphological Analyzer written in pure Rust](https://github.com/agatan/yoin) - [wareya/notmecab-rs: notmecab-rs is a very basic mecab clone, designed only to do parsing, not training.](https://github.com/wareya/notmecab-rs) ### ロゴ -- [Sudachiのロゴ](https://github.com/WorksApplications/Sudachi/blob/develop/docs/Sudachi.png) +- [Sudachi のロゴ](https://github.com/WorksApplications/Sudachi/blob/develop/docs/Sudachi.png) - カニのイラスト: [Pixabay](https://pixabay.com/ja/vectors/%E5%8B%95%E7%89%A9-%E3%82%AB%E3%83%8B-%E7%94%B2%E6%AE%BB%E9%A1%9E-%E6%B5%B7-2029728/) diff --git a/README.md b/README.md index e21bb2bc..0bf53ec5 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,22 @@ [![Rust](https://github.com/WorksApplications/sudachi.rs/actions/workflows/rust.yml/badge.svg)](https://github.com/WorksApplications/sudachi.rs/actions/workflows/rust.yml) -**2023-12-14 UPDATE**: [0.6.8 Release](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.7) - -Try it: -```shell -pip install --upgrade 'sudachipy>=0.6.8' -``` - -

sudachi.rs logo

sudachi.rs is a Rust implementation of [Sudachi](https://github.com/WorksApplications/Sudachi), a Japanese morphological analyzer. -[日本語 README](README.ja.md) [SudachiPy Documentation](https://worksapplications.github.io/sudachi.rs/python) +[日本語 README](README.ja.md), [SudachiPy Documentation](./python/README.md) ## TL;DR +Install Python version + +```bash +pip install --upgrade 'sudachipy>=0.6.9' +``` + +or Rust version + ```bash $ git clone https://github.com/WorksApplications/sudachi.rs.git $ cd ./sudachi.rs @@ -76,15 +76,14 @@ $ sudachi --wakati lemon.txt それ が 来 た の だ 。 これ は ちょっと いけ なかっ た 。 ``` - ## Setup You need sudachi.rs, default plugins, and a dictionary. (This crate don't include dictionary.) ### 1. Get the source code -``` -$ git clone https://github.com/WorksApplications/sudachi.rs.git +```sh +git clone https://github.com/WorksApplications/sudachi.rs.git ``` ### 2. Download a Sudachi Dictionary @@ -95,16 +94,20 @@ By the default setting file, sudachi.rs assumes that it is placed at `resources/ #### Convenience Script -Optionally, you can use the [`fetch_dictionary.sh`](fetch_dictionary.sh) shell script to download a dictionary and install it to `resources/system.dic`. +Optionally, you can use the [`fetch_dictionary.sh`](fetch_dictionary.sh) shell script to download a dictionary and install it to `resources/system.dic` (overrides). -``` -$ ./fetch_dictionary.sh +```sh +# fetch latest core dictionary +./fetch_dictionary.sh + +# fetch dictionary of specified version and type +./fetch_dictionary.sh 20241021 small ``` ### 3. Build -``` -$ cargo build --release +```sh +cargo build --release ``` #### Build (bake dictionary into binary) @@ -119,6 +122,7 @@ You must specify the path the dictionary file in the `SUDACHI_DICT_PATH` environ `SUDACHI_DICT_PATH` is relative to the sudachi.rs directory (or absolute). 
Example on Unix-like system: + ```sh # Download dictionary to resources/system.dic $ ./fetch_dictionary.sh @@ -132,11 +136,11 @@ $ env SUDACHI_DICT_PATH=resources/system.dic cargo build --release --features ba $ env SUDACHI_DICT_PATH=/path/to/my-sudachi.dic cargo build --release --features bake_dictionary ``` - ### 4. Install -``` -sudachi.rs/ $ cargo install --path sudachi-cli/ +```sh +$ cd sudachi.rs/ +$ cargo install --path sudachi-cli/ $ which sudachi /Users//.cargo/bin/sudachi @@ -147,7 +151,6 @@ A Japanese tokenizer ... ``` - ## Usage as a command ```bash @@ -234,6 +237,9 @@ $ echo "外国人参政権" | sudachi -m A -w 外国 人 参政 権 ``` +## API + +See [API reference page](https://worksapplications.github.io/sudachi.rs/rust/sudachi/). ## ToDo @@ -241,7 +247,6 @@ $ echo "外国人参政権" | sudachi -m A -w - [ ] Easy dictionary file install & management, [similar to SudachiPy](https://github.com/WorksApplications/SudachiPy/issues/73) - [ ] Registration to crates.io - ## References ### Sudachi @@ -251,7 +256,6 @@ $ echo "外国人参政権" | sudachi -m A -w - [WorksApplications/SudachiPy](https://github.com/WorksApplications/SudachiPy) - [msnoigrs/gosudachi](https://github.com/msnoigrs/gosudachi) - ### Morphological Analyzers in Rust - [agatan/yoin: A Japanese Morphological Analyzer written in pure Rust](https://github.com/agatan/yoin) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 7288a241..c15de255 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -1,9 +1,88 @@ -# Change log +# Changelog -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). -## Unreleased +Also check [rust changelog](../CHANGELOG.md). 
+## [Unreleased] + +## [0.6.9](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.9) (2024-11-20) + +### Added + +- Allow string literals as `SplitMode` (#245) +- Add `sudachipy.Config` and `sudachipy.errors.SudachiError` to default import (#260) +- Add support for Python3.13 + - Python3.13t (no GIL) is not supported yet + - by updating PyO3 dependency to v0.22 (#265, #276) + +### Changed + +- `-s` (system dictionary path) of `sudachi ubuild` command is now required (#239) +- Migrate from setup.py install (#252) +- `-d` option of sudachi cli (which is no-op) now warns (#278) +- Update the output of `sudachi dump` subcommand (#277) + +### Fixed + +- Documentation fix/update (#247 by @t-yamamura, #250, #268) +- Change the way how python error is raised (#273) +- Fix clippy warnings without breaking changes (#263) + +### Removed + +- Remove Python 3.7 and 3.8 support as it reaches its end of life (https://devguide.python.org/versions/) (#249, #281). + +## [0.6.8](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.8) (2023-12-14) + +- Produce builds for Python 3.12 (#236) +- Add a simple [configuration API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#config-config) +- Add surface projections (#230) + +### Surface Projections + +- For chiTra compatibility SudachiPy can now directly produce different tokens in the surface field. +- Original surface is accessible via [`Morpheme.raw_surface()`](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Morpheme.raw_surface) method +- It is possible to customize projection dictionary-wise, via Config object, passing it on a dictionary creation, or for a single pre-tokenizer. 
+ - [Config API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.config.Config.projection) + - [Pretokenizer API](https://worksapplications.github.io/sudachi.rs/python/api/sudachipy.html#sudachipy.Dictionary.pre_tokenizer) + +## [0.6.7](https://github.com/WorksApplications/sudachi.rs/releases/tag/v0.6.7) (2023-02-16) + +- Produce builds for Python 3.11 +- Add `Dictionary.lookup()` method which allows you to enumerate morphemes from the dictionary without performing analysis. + +## [0.6.6] (2022-07-25) + +- Add [boundary matching mode](https://github.com/WorksApplications/Sudachi/blob/develop/docs/oov_handlers.md) to regex oov handler +- macOS binary builds are now universal2 (arm+x64) + +### MacOS + +- Binary builds are universal2 (arm+x64) +- Caveat: we don't run tests on arm because there are no public arm instances, so builds may be broken without any warning + +## [0.6.5] (2022-06-21) + +- Fixed invalid POS tags which appeared when using user-defined POS tags both in user dictionaries and OOV handlers. + You are not affected by this bug if you did not use user-defined POS in OOV handlers. + +## [0.6.4] (2022-06-16) + +### Added + +- OOV handler plugins support user-defined POS, [similar to Java version](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) +- Added Regex OOV handler + +### Regex OOV Handler + +- For details, see [Java version changelog](https://github.com/WorksApplications/Sudachi/releases/tag/v0.6.0) +- In Rust/Python Regexes do not support backtracking and backreferences +- `maxLength` setting defines maximum length in unicode codepoints, not in utf-8 bytes as in Java (will be changed to codepoints later) + +### Removed + +- Remove Python 3.6 support which reached end-of-life status on [2021-12-23](https://endoflife.date/python) - Print Debug feature is disabled now. - `-d` option of `sudachipy` cli does nothing. - `sudachipy.Tokenizer` will ignore the provided logger. 
@@ -46,11 +125,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - https://github.com/WorksApplications/sudachi.rs/pull/187 ### Deprecated -* `dict_type` parameter of `Dictionary()` constructor. Use `dict` instead which is a complete alias. + +- `dict_type` parameter of `Dictionary()` constructor. Use `dict` instead which is a complete alias. ### Note -* Do not use `mode` parameter of `Tokenizer.tokenize()` method if you always tokenize with a single mode. - * Use the mode parameter of `Dictionary.create()` method instead. + +- Do not use `mode` parameter of `Tokenizer.tokenize()` method if you always tokenize with a single mode. + - Use the mode parameter of `Dictionary.create()` method instead. ## [0.6.0] - 2021/10/11 @@ -66,38 +147,37 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - From this version, SudachiPy is provided as a binding of [the Rust implementation](https://github.com/WorksApplications/sudachi.rs). - See [API reference page](https://worksapplications.github.io/sudachi.rs/python/) for all APIs. - Since this is release-candidate version, you need to explicitly specify version to install. - - `pip install sudachipy==0.6.0rc1` - - You also need to install `sudachidict_*` before since installing it will overwrite this version. + - `pip install sudachipy==0.6.0rc1` + - You also need to install `sudachidict_*` before since installing it will overwrite this version. ### Changed - Module structure changed: every classes locate at the root module. - - Import is now like: `from sudachipy import Dictionary, Tokenizer` - - You can still import them in the previous way (not recommended). - - `from sudachipy.dictionary import Dictionary` + - Import is now like: `from sudachipy import Dictionary, Tokenizer` + - You can still import them in the previous way (not recommended). 
+ - `from sudachipy.dictionary import Dictionary` - `MorphemeList.empty` now needs a `sudachipy.Dictionary` instance as arguments. - - __This method is also marked as deprecated.__ + - **This method is also marked as deprecated.** ### Deprecated - `MorphemeList.empty(dict)` - - Users should not generate MorphemeList by themselves. - - Use `Tokenizer.tokenize("")` if you need. + - Users should not generate MorphemeList by themselves. + - Use `Tokenizer.tokenize("")` if you need. - `Morpheme.get_word_info()` - - Users should not touch the raw WordInfo. - - Necessary fields are provided via `Morpheme`. - - Please create an issue if fields you need is not implemented to `Morpheme`. + - Users should not touch the raw WordInfo. + - Necessary fields are provided via `Morpheme`. + - Please create an issue if fields you need is not implemented to `Morpheme`. - `Morpheme.split(mode)` - - The API around this feature will change. - - See issue [#92]. + - The API around this feature will change. + - See issue [#92]. ### Removed - Some of APIs are not supported. - - See [API reference page](https://worksapplications.github.io/sudachi.rs/python/) for the full list of supported APIs. + - See [API reference page](https://worksapplications.github.io/sudachi.rs/python/) for the full list of supported APIs. - Most of instance attributes are unaccessible. - - You cannot access `Dictionary.grammar` or `Dictionary.lexicon`. - + - You cannot access `Dictionary.grammar` or `Dictionary.lexicon`. 
## [0.5.4] diff --git a/python/README.md b/python/README.md index b1ad3e5e..71289aab 100644 --- a/python/README.md +++ b/python/README.md @@ -1,4 +1,5 @@ # SudachiPy + [![PyPi version](https://img.shields.io/pypi/v/sudachipy.svg)](https://pypi.python.org/pypi/sudachipy/) [![](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/) [Documentation](https://worksapplications.github.io/sudachi.rs/python) @@ -53,7 +54,6 @@ morphemes = tokenizer.tokenize("国会議事堂前駅", SplitMode.A) print([m.surface() for m in morphemes]) # ['国会', '議事', '堂', '前', '駅'] ``` - ## Setup You need SudachiPy and a dictionary. @@ -61,7 +61,7 @@ You need SudachiPy and a dictionary. ### Step 1. Install SudachiPy ```bash -$ pip install sudachipy +pip install sudachipy ``` ### Step 2. Get a Dictionary @@ -69,12 +69,11 @@ $ pip install sudachipy You can get dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition). ```bash -$ pip install sudachidict_core +pip install sudachidict_core ``` Alternatively, you can choose other dictionary editions. See [this section](#dictionary-edition) for the detail. - ## Usage: As a command There is a CLI command `sudachipy`. @@ -113,8 +112,7 @@ optional arguments: -v, --version print sudachipy version ``` -__Note: The Debug option (`-d`) is disabled in version 0.6.0.__ - +**Note: The Debug option (`-d`) is disabled in version 0.6.\*** ### Output @@ -148,14 +146,12 @@ quei 名詞,普通名詞,一般,*,*,* quei quei -1 [] (OOV) EOS ``` - ## Usage: As a Python package ### API See [API reference page](https://worksapplications.github.io/sudachi.rs/python/). - ### Example ```python @@ -202,7 +198,6 @@ tokenizer_obj.tokenize("シュミレーション", mode)[0].normalized_form() (With `20210802` `core` dictionary. The results may change when you use other versions) - ## Dictionary Edition There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `full`. 
See [WorksApplications/SudachiDict](https://github.com/WorksApplications/SudachiDict) for the detail. @@ -211,9 +206,9 @@ SudachiPy uses `sudachidict_core` by default. Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. -* [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/) -* [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/) -* [SudachiDict-full · PyPI](https://pypi.org/project/SudachiDict-full/) +- [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/) +- [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/) +- [SudachiDict-full · PyPI](https://pypi.org/project/SudachiDict-full/) The dictionary files are not in the package itself, but it is downloaded upon installation. @@ -231,7 +226,6 @@ $ pip install sudachidict_full $ echo "外国人参政権" | sudachipy -s full ``` - ### Dictionary option: Python package You can specify the dictionary with the `Dicionary()` argument; `config` or `dict`. @@ -241,12 +235,12 @@ class Dictionary(config=None, resource_dir=None, dict=None) ``` 1. `config` - * You can specify the file path to the setting file with `config` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). - * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary. + - You can specify the file path to the setting file with `config` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). + - If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary. 2. `dict` - * You can also specify the dictionary type with `dict`. - * The available arguments are `small`, `core`, `full`, or a path to the dictionary file. - * If different dictionaries are specified with `config` and `dict`, **a dictionary defined `dict` overrides** those defined in the config. 
+ - You can also specify the dictionary type with `dict`. + - The available arguments are `small`, `core`, `full`, or a path to the dictionary file. + - If different dictionaries are specified with `config` and `dict`, **a dictionary defined `dict` overrides** those defined in the config. ```python from sudachipy import Dictionary @@ -267,7 +261,6 @@ tokenizer_obj = Dictionary(dict="full").create() # sudachidict_full tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create() ``` - ### Dictionary in The Setting File Alternatively, if the dictionary file is specified in the setting file, `sudachi.json`, SudachiPy will use that file. @@ -285,7 +278,6 @@ The default setting file is [sudachi.json](https://github.com/WorksApplications/ $ sudachipy -r path/to/sudachi.json ``` - ## User Dictionary To use a user dictionary, `user.dic`, place [sudachi.json](https://github.com/WorksApplications/sudachi.rs/blob/develop/python/py_src/sudachipy/resources/sudachi.json) to anywhere you like, and add `userDict` value with the relative path from `sudachi.json` to your `user.dic`. @@ -325,7 +317,6 @@ required named arguments: About the dictionary file format, please refer to [this document](https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md) (written in Japanese, English version is not available yet). - ## Customized System Dictionary ```bash @@ -361,7 +352,6 @@ Then specify your `sudachi.json` with the `-r` option. $ sudachipy -r path/to/sudachi.json ``` - ## For Developers ### Build from source @@ -370,10 +360,9 @@ $ sudachipy -r path/to/sudachi.json 1. Install python module `setuptools` and `setuptools-rust`. 2. Run `./build-sdist.sh` in `python` dir. - - source distribution will be generated under `python/dist/` dir. + - source distribution will be generated under `python/dist/` dir. 3. Install it via pip: `pip install ./python/dist/SudachiPy-[version].tar.gz` - #### Install develop build 1. 
Install python module `setuptools` and `setuptools-rust`. @@ -382,12 +371,10 @@ $ sudachipy -r path/to/sudachi.json ref: [setuptools-rust](https://github.com/PyO3/setuptools-rust) - ### Test Run `build_and_test.sh` to run the tests. - ## Contact Sudachi and SudachiPy are developed by [WAP Tokushima Laboratory of AI and NLP](http://nlp.worksap.co.jp/). diff --git a/python/build-wheels-manylinux-pgo.sh b/python/build-wheels-manylinux-pgo.sh index 428f6fdd..da0d12d8 100644 --- a/python/build-wheels-manylinux-pgo.sh +++ b/python/build-wheels-manylinux-pgo.sh @@ -1,6 +1,8 @@ #!/bin/bash set -ex +# This script is assumed to be used inside https://github.com/pypa/manylinux. + DIR=$(dirname "$(readlink -f "$0")") curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y --no-modify-path --component llvm-tools-preview @@ -31,7 +33,10 @@ cd "$DIR" export RUSTFLAGS='-C profile-use=/tmp/sudachi-profdata.merged -C opt-level=3' export CARGO_BUILD_TARGET=x86_64-unknown-linux-gnu -for PYBIN in /opt/python/cp{37,38,39,310,311,312}*/bin; do +# see following link for the list of cpython bin +# https://github.com/pypa/manylinux?tab=readme-ov-file#image-content +# TODO: after supporting py313t, "/opt/python/cp{37,38,39,310,311,312,313}-*/bin" would suffice. +for PYBIN in /opt/python/cp*-cp{37m,38,39,310,311,312,313}/bin; do "${PYBIN}/pip" install -U setuptools wheel setuptools-rust find . 
-iname 'sudachipy*.so' rm -f build/lib/sudachipy/sudachipy*.so diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index d744aa33..95efcb52 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -22,7 +22,7 @@ author = 'Works Applications' # The full version, including alpha/beta/rc tags -release = '0.6.9-a1' +release = '0.6.10-a1' # -- General configuration --------------------------------------------------- diff --git a/python/latest_dev_version.py b/python/latest_dev_version.py deleted file mode 100644 index 664c9da0..00000000 --- a/python/latest_dev_version.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2021 Works Applications Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Set the version in setup.py to the next unused .dev version -# Used versions are acquired directly from - -import json -import re -import sys -import urllib.request -from pathlib import Path - -cur_file = Path(__file__) - -setup_py = cur_file.parent / "setup.py" - -with setup_py.open("rt", encoding="utf-8") as f: - setup_py_data = f.read() - -version_re = re.compile('version="([^"]+)",') -cur_version = version_re.findall(setup_py_data) - -if len(cur_version) != 1: - print("could not find version", sys.stderr) - exit(1) - -cur_version = cur_version[0] - -print("Current version:", cur_version) - -if "dev" in cur_version: - print("Can't modify dev version") - exit(1) - -response = urllib.request.urlopen("https://test.pypi.org/pypi/SudachiPy/json") -data = json.loads(response.read()) - -remote_versions = set(data["releases"].keys()) - -remote_versions.add("0.6.0") # it was deleted - -next_version_re = re.compile("""^(.*)\.dev(\d+)$""") - - -def next_version(version): - m = next_version_re.match(version) - if m is None: - return version + ".dev1" - else: - p1 = m.group(1) - p2 = int(m.group(2)) - return "{}.dev{}".format(p1, p2 + 1) - - -print("Remote versions:", sorted(remote_versions)) - -next_v = next_version(cur_version) - -while next_v in remote_versions: - next_v = next_version(next_v) - -print("::notice::Next version:", next_v) - -modified_setup_py = version_re.sub('version="{}",'.format(next_v), setup_py_data, 1) - -with setup_py.open("wt", encoding='utf-8') as f: - f.write(modified_setup_py) diff --git a/python/modify_version_for_testpypi.py b/python/modify_version_for_testpypi.py new file mode 100755 index 00000000..e93ccd40 --- /dev/null +++ b/python/modify_version_for_testpypi.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021-2024 Works Applications Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Set the version in setup.py to the next unused version. +# This script is used to upload to TestPyPI (that does not allow same version) in python-upload-test workflow. +# +# 1. if current version has pre/post/dev part, increment the last part +# 2. if current version is final version, add post part +# +# we should avoid `.devN` if possible, since it's hard to handle version order with it. +# e.g. `1.2.dev1` < `1.2a1.dev1` < `1.2a1` < `1.2` +# ref: https://packaging.python.org/en/latest/specifications/version-specifiers/ + +import json +import re +import sys +import urllib.request +from pathlib import Path +from packaging.version import Version, InvalidVersion + +# find current version +cur_file = Path(__file__) +setup_py = cur_file.parent / "setup.py" + +with setup_py.open("rt", encoding="utf-8") as f: + setup_py_data = f.read() + +version_re = re.compile('version="([^"]+)",') +cur_version = version_re.findall(setup_py_data) + +if len(cur_version) != 1: + print("could not find version", sys.stderr) + exit(1) + +try: + cur_version = Version(cur_version[0]) + print("Current version:", cur_version) +except InvalidVersion: + print(f"{cur_version} is invalid as a python version") + exit(1) + +# find remote versions (in TestPyPI) +response = urllib.request.urlopen("https://test.pypi.org/pypi/SudachiPy/json") +data = json.loads(response.read()) + +remote_versions = set(data["releases"].keys()) + +# add deleted version to the list +remote_versions.add("0.6.0") + +print("Remote versions:", sorted(remote_versions)) + + +def increment_version(v: Version): + pre = 
v.pre + post = v.post + dev = v.dev + + if v.is_devrelease: + dev += 1 + elif v.is_postrelease: + post += 1 + elif v.is_prerelease: + pre = (pre[0], pre[1]+1) + else: # is final release + post = 1 + + next = v.base_version + \ + ("" if pre is None else f"{pre[0]}{pre[1]}") + \ + ("" if post is None else f".post{post}") + \ + ("" if dev is None else f".dev{dev}") + + assert Version(next) > v + return Version(next) + + +# search proper version to upload +next_v = cur_version + +while str(next_v) in remote_versions: + next_v = increment_version(next_v) + + +print("::notice::Next version:", next_v) + +modified_setup_py = version_re.sub( + 'version="{}",'.format(next_v), setup_py_data, 1) + +with setup_py.open("wt", encoding='utf-8') as f: + f.write(modified_setup_py) diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py index fb551538..1f7d9338 100644 --- a/python/py_src/sudachipy/__init__.py +++ b/python/py_src/sudachipy/__init__.py @@ -14,7 +14,7 @@ from importlib.util import find_spec as _find_spec from pathlib import Path as _Path -__version__ = "0.6.9-a1" +__version__ = "0.6.10-a1" _DEFAULT_RESOURCEDIR = _Path(__file__).resolve().parent / 'resources' _DEFAULT_SETTINGFILE = _DEFAULT_RESOURCEDIR / 'sudachi.json' diff --git a/python/setup.py b/python/setup.py index b3eedcc5..ce3fcded 100644 --- a/python/setup.py +++ b/python/setup.py @@ -17,7 +17,7 @@ setup( name="SudachiPy", - version="0.6.9-a1", + version="0.6.10-a1", description="Python version of Sudachi, the Japanese Morphological Analyzer", long_description=open('README.md', encoding='utf-8').read(), long_description_content_type="text/markdown", diff --git a/update_version.sh b/update_version.sh index e882a3ed..d3991598 100755 --- a/update_version.sh +++ b/update_version.sh @@ -4,6 +4,7 @@ set -eu if [ $# -lt 1 ] || ( [ $# -lt 2 ] && [ "$1" != "show" ] ) ; then echo "Provide 2 arguments [from] and [to] to update version, or 'show' to print current one." 
+ echo "Note that the version should follow semantic-versioning and PEP440, e.g. '1.2.3' or '1.2.3-a4'" exit 1 fi