diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4708333..1f121ef 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,6 +1,5 @@ name: Build on: [push, pull_request] - jobs: lint: if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') @@ -25,7 +24,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: [3.7, 3.8, 3.9] os: [ubuntu-latest , macos-latest, windows-latest] steps: - name: Checkout @@ -36,9 +35,10 @@ jobs: python-version: ${{ matrix.python-version }} - name: Run image uses: abatilo/actions-poetry@v2.0.0 - - name: Install latest rust + - name: Install Rust uses: actions-rs/toolchain@v1 with: + profile: minimal toolchain: stable override: true - name: Install dependencies @@ -46,4 +46,4 @@ jobs: - name: Build Python package run: poetry run maturin develop - name: Test - run: poetry run pytest tests + run: poetry run pytest -Werror tests diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f93feb5..accf916 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,15 +1,14 @@ name: Deploy on: release: - types: [published] - + types: [released] jobs: deploy: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: [3.7, 3.8, 3.9] os: [ubuntu-latest, macos-latest, windows-latest] steps: - name: Checkout @@ -18,15 +17,29 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install latest rust + - name: Install Rust uses: actions-rs/toolchain@v1 with: + profile: minimal toolchain: stable override: true - - name: Install dependencies - run: | - python -m pip install --upgrade pip maturin - - name: Build & Publish to PyPi - run: maturin publish --username __token__ --no-sdist --interpreter python${{matrix.python_version}} --manylinux=2014 + - name: Publish Package + if: matrix.os != 'windows-latest' + uses: messense/maturin-action@v1 + with: + maturin-version: latest + command: publish + manylinux: 2014 + args: --username=__token__ --no-sdist --interpreter=python${{ matrix.python-version }} + env: + MATURIN_PASSWORD: ${{ secrets.pypi_password }} + - name: Publish Package + if: matrix.os == 'windows-latest' + uses: messense/maturin-action@v1 + with: + maturin-version: latest + command: publish + manylinux: 2014 + args: --username=__token__ --no-sdist --interpreter=python env: MATURIN_PASSWORD: ${{ secrets.pypi_password }} diff --git a/Cargo.toml b/Cargo.toml index 440b3ff..27c355b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastzy" -version = "0.3.1" +version = "0.3.2" authors = ["Gal Ben David "] edition = "2018" description = "Python library for fast fuzzy search over a big file written in Rust" @@ -8,16 +8,19 @@ readme = "README.md" repository = "https://github.com/intsights/fastzy" homepage = "https://github.com/intsights/fastzy" license = "MIT" -keywords = ["fuzzy", "levenshtein", "rust"] +keywords = [ + "fuzzy", + "levenshtein", + "rust", +] [package.metadata.maturin] -requires-python = ">=3.6" +requires-python = ">=3.7" classifier = [ "License :: OSI Approved :: MIT License", "Operating System :: MacOS", "Operating System :: Microsoft", "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", @@ -33,7 +36,7 @@ parking_lot = "0.11" rayon = "1.5" [dependencies.pyo3] -version = "0.13.1" +version = "0.13.2" features = ["extension-module"] [profile.release] diff --git a/LICENSE b/LICENSE index cf5d6d8..6f3ba7c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Gal Ben David +Copyright (c) 2021 Gal Ben David Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pyproject.toml b/pyproject.toml index d793d26..1111fad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ strip = true [tool.poetry] name = "fastzy" -version = "0.3.1" +version = "0.3.2" authors = ["Gal Ben David "] description = "Python library for fast fuzzy search over a big file written in Rust" readme = "README.md" @@ -31,7 +31,6 @@ classifiers = [ "Operating System :: MacOS", "Operating System :: Microsoft", "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", @@ -39,7 +38,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = "^3.6" +python = "^3.7" [tool.poetry.dev-dependencies] pytest = "*" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 31df213..0000000 --- a/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[aliases] -test=pytest - -[tool:pytest] -addopts = --tb=native -s -Wall diff --git a/src/lib.rs b/src/lib.rs index 74628d7..9b356fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,18 +21,17 @@ const WAGNER_FISCHER_ARR_INIT: [usize;100] = [ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99 ]; -const MBLEVEN_MATRIX: [&[u8];63] = [ - b"re", b"", b"", b"", b"", b"", b"", - b"de", b"", b"", b"", b"", b"", b"", - b"rre", b"ide", b"die", b"", b"", b"", b"", - b"rde", b"dre", b"", b"", b"", b"", b"", - b"dde", b"", b"", b"", b"", b"", b"", - b"rrre", b"idre", b"irde", b"ride", b"rdie", b"drie", b"dire", - b"rrde", b"rdre", b"drre", b"idde", b"dide", b"ddie", b"", - b"rdde", b"drde", b"ddre", b"", b"", b"", b"", - b"ddde", b"", b"", b"", b"", b"", b"", +const MBLEVEN_MATRIX: [u8;72] = [ + 3, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 15, 9, 6, 0, 0, 0, 0, 0, + 13, 7, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0, + 63, 39, 45, 57, 54, 30, 27, 0, + 61, 55, 31, 37, 25, 22, 0, 0, + 53, 29, 23, 0, 0, 0, 0, 0, + 21, 0, 0, 0, 0, 0, 0, 0, ]; -const MATRIX_ROW_INDEX: [usize;3] = [0, 2, 5]; #[pyclass] #[text_signature = "(file_path, separator, /)"] @@ -65,23 +64,21 @@ impl Searcher { let input_file = BufReader::new(input_file); let mut prefix_len; - for line in input_file.lines() { - if let Ok(line) = line { - if separator.is_empty() { - prefix_len = line.len(); - } else if let Some(separator_pos) = line.find(separator) { - prefix_len = separator_pos; - } else { - prefix_len = line.len(); - } + for line in input_file.lines().flatten() { + if separator.is_empty() { + prefix_len = line.len(); + } else if let Some(separator_pos) = line.find(separator) { + prefix_len = separator_pos; + } else { + prefix_len = line.len(); + } - if max_length < prefix_len { - max_length = prefix_len; - } - let index = indices.entry(prefix_len).or_insert(String::new()); - index.push_str(&line); - index.push('\n'); + if max_length < prefix_len { + max_length = prefix_len; } + let index = indices.entry(prefix_len).or_insert_with(String::new); + index.push_str(&line); + index.push('\n'); } Ok( @@ -140,68 +137,83 @@ impl Searcher { } #[staticmethod] - fn mbleven( - first_string: &str, - second_string: &str, + fn mbleven<'a>( + mut first_string: &'a str, + mut second_string: &'a str, max_distance: usize, ) -> bool { - let mut i: usize; - let mut j: usize; - let mut c: usize; - - let longer_str; - let shorter_str; - if first_string.len() > second_string.len() { - longer_str = first_string; - shorter_str = second_string; - } else { - longer_str = second_string; - shorter_str = first_string; + let mut m: u8; + let mut differences: usize; + + if max_distance == 0 { + return first_string == second_string; } - let matrix_row_index = if max_distance == 0 { - 0 - } else { - max_distance - 1 - }; + let mut first_string_len = first_string.chars().count(); + let mut second_string_len = second_string.chars().count(); - let row = MATRIX_ROW_INDEX[matrix_row_index] + (longer_str.len() - shorter_str.len()); - for col in 0..7 { - let model = MBLEVEN_MATRIX[row * 7 + col]; - if model.is_empty() { - break; - } + if first_string_len < second_string_len { + std::mem::swap(&mut first_string, &mut second_string); + std::mem::swap(&mut first_string_len, &mut second_string_len); + } - i = 0; - j = 0; - c = 0; - - while i < longer_str.len() && j < shorter_str.len() && c <= max_distance { - if longer_str.as_bytes()[i] != shorter_str.as_bytes()[j] { - match model[c] { - b'd' => { - i += 1; - }, - b'r' => { - i += 1; - j += 1; - }, - b'i' => { - j += 1; - }, - b'e' => { - c = max_distance + 1; - }, - _ => (), - } - c += 1; - } else { - i += 1; - j += 1; + let strings_len_difference = first_string_len - second_string_len; + if max_distance < strings_len_difference { + return false; + } + + let mut pos: usize = ((max_distance + max_distance.pow(2)) / 2 - 1 + strings_len_difference) * 8; + while MBLEVEN_MATRIX[pos] > 0 { + m = MBLEVEN_MATRIX[pos]; + pos += 1; + differences = 0; + + let mut first_string_chars = first_string.chars(); + let mut second_string_chars = second_string.chars(); + let mut first_string_current_char = first_string_chars.next(); + let mut second_string_current_char = second_string_chars.next(); + + loop { + match (first_string_current_char, second_string_current_char) { + (Some(first_string_char), Some(second_string_char)) => { + if first_string_char != second_string_char { + differences += 1; + + if m == 0 { + differences += first_string_chars.count() + second_string_chars.count() + 2; + + break; + } + if m & 1 > 0 { + first_string_current_char = first_string_chars.next(); + } + if m & 2 > 0 { + second_string_current_char = second_string_chars.next(); + } + + m >>= 2; + } else { + first_string_current_char = first_string_chars.next(); + second_string_current_char = second_string_chars.next(); + } + }, + (Some(_first_string_char), None) => { + differences += first_string_chars.count() + 1; + + break; + }, + (None, Some(_second_string_char)) => { + differences += second_string_chars.count() + 1; + + break; + }, + (None, None) => { + break; + }, } } - if c + (longer_str.len() - i) + (shorter_str.len() - j) <= max_distance { + if differences <= max_distance { return true; } } @@ -219,23 +231,28 @@ impl Searcher { let mut dia: usize; let mut tmp: usize; - for i in 1..first_string.len() + 1 { - dia = i - 1; - arr[0] = i; + if max_distance == 0 { + return first_string == second_string; + } + + for (i, first_string_current_char) in first_string.chars().enumerate() { + dia = i; + arr[0] = i + 1; - for j in 1..second_string.len() + 1 { - tmp = arr[j]; + for (j, second_string_current_char) in second_string.chars().enumerate() { + tmp = arr[j + 1]; - if first_string.as_bytes()[i - 1] != second_string.as_bytes()[j - 1] { - arr[j] = min(min(arr[j], arr[j - 1]), dia) + 1; + if first_string_current_char != second_string_current_char { + arr[j + 1] = min(min(arr[j + 1], arr[j]), dia) + 1; } else { - arr[j] = dia; + arr[j + 1] = dia; } + dia = tmp; } } - arr[second_string.len()] <= max_distance + arr[second_string.chars().count()] <= max_distance } } diff --git a/tests/test_fastzy.py b/tests/test_fastzy.py index eccae74..2b97fd8 100644 --- a/tests/test_fastzy.py +++ b/tests/test_fastzy.py @@ -218,6 +218,12 @@ def test_wagner_fischer( self.assertTrue( expr=fastzy.Searcher.wagner_fischer('kitten', 'mittens', 2), ) + self.assertTrue( + expr=fastzy.Searcher.wagner_fischer('אבא', 'אמא', 1), + ) + self.assertTrue( + expr=fastzy.Searcher.wagner_fischer('אﺑא', 'אמא', 1), + ) self.assertFalse( expr=fastzy.Searcher.wagner_fischer('1234', '1', 2), @@ -312,6 +318,12 @@ def test_mbleven( self.assertTrue( expr=fastzy.Searcher.mbleven('kitten', 'mittens', 2), ) + self.assertTrue( + expr=fastzy.Searcher.mbleven('אבא', 'אמא', 1), + ) + self.assertTrue( + expr=fastzy.Searcher.mbleven('אﺑא', 'אמא', 1), + ) self.assertFalse( expr=fastzy.Searcher.mbleven('1234', '1', 2),