findsimilar · quillcraftsman · Nov 9, 2023 · Nov 6, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,21 @@
+name: Lint
+
+on:
+  push:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.10'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+    - name: Analysing the code with pylint
+      run: |
+        make lint
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -7,13 +7,15 @@ on:
 jobs:
   tests:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [ "3.10", "3.11", '3.12' ]
     steps:
-    - name: Check out code
-      uses: actions/checkout@v3
-    - name: Set up Python
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v3
       with:
-        python-version: "3.10"
+        python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/.gitignore b/.gitignore
@@ -124,6 +124,7 @@ celerybeat.pid
 .venv
 env/
 venv/
+venv10/
 ENV/
 env.bak/
 venv.bak/
@@ -157,4 +158,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
+.idea/
+
+# uploaded data file in the uploads folder
+uploads/loaddata.xlsx
diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,16 @@
+[MASTER]
+disable=
+    R0801, #  Duplicate code (similar lines in several files)
+    E1101, #  Dynamic attributes
+    W0511, #  Warning notes such as FIXME/TODO/XXX in comments
+    C0103, #  Pylint doesn't like id attribute
+    W0201, #  Attribute defined outside init method
+    R0903, #  Too few public methods
+    C0114, #  Module docstring
+    C0115, #  Class docstring
+    C0116, #  Method docstring
+    E1123, # Unexpected keyword argument
+    R0901, # Too many ancestors (deep class inheritance)
+
+ignore-paths=.*/migrations
+
diff --git a/CHECKLIST.md b/CHECKLIST.md
@@ -29,10 +29,10 @@
 - [x] [Support](https://github.com/quillcraftsman/open-source-checklist#support)
 
 [CI and CD](https://github.com/quillcraftsman/open-source-checklist#ci-and-cd)
-- [ ] Tests
-- [ ] Test Coverage
-- [ ] Test Coverage 100%
-- [ ] Linters
+- [x] Tests
+- [x] Test Coverage
+- [x] Test Coverage 100%
+- [x] Linters
 - [ ] Build
 - [ ] Deploy
 - [ ] New User Greetings
diff --git a/Makefile b/Makefile
@@ -1,26 +1,19 @@
 make test:
 	python manage.py test
 
-test-proximity:
-	python manage.py test analysis.tests.tests_proximity
-
 server:
 	python manage.py runserver
 
 coverage:
 	coverage run --source='.' manage.py test
-	coverage html --omit=laboratory/asgi.py,laboratory/wsgi.py,manage.py,analysis/management/*
-	coverage report --omit=laboratory/asgi.py,laboratory/wsgi.py,manage.py,analysis/management/* --fail-under=100
-
+	coverage html --omit=laboratory/asgi.py,laboratory/wsgi.py,manage.py,*/management/*
+	coverage report --omit=laboratory/asgi.py,laboratory/wsgi.py,manage.py,*/management/* --fail-under=100
 
 migrate:
 	python manage.py migrate
 
-compare_two:
-	python manage.py compare_two "$(one)" "$(two)"
-
-example_frequency_analysis:
-	python manage.py example_frequency_analysis "$(example)"
+pylint:
+	pylint $(shell git ls-files '*.py')
 
-load_training_data:
-	python manage.py load_training_data $(name) $(filepath) $(sheet_name)
+lint:
+	make pylint
diff --git a/README.md b/README.md
@@ -53,32 +53,6 @@ make coverage
 make lint
 ```
 
-## Use find_similar core function
-
-Instead of:
-```python
-from find_similar import find_similar  # You will get import error in this case
-```
-
-Use:
-```python
-from django.conf import settings
-settings.FIND_SIMILAR('none', ['one', 'two'])
-settings.TOKENIZE('some text')
-```
-
-Or if you don't like UPPER_CASE:
-```python
-from django.conf import settings
-find_similar = settings.FIND_SIMILAR
-tokenize = settings.TOKENIZE
-
-find_similar('none', ['one', 'two'])
-tokenize('some text')
-```
-
-Looks weird, please make pull request if you find a better way
-
 ## Management commands
 
 ### Get tokens from one text
@@ -128,11 +102,6 @@ Done:
 End
 ```
 
-With make:
-```commandline
-make one="one" two="two" compare_two
-```
-
 ### Example frequency analysis
 
 Input:
@@ -149,11 +118,6 @@ Done:
 End
 ```
 
-With make:
-```commandline
-make example="mock" example_frequency_analysis
-```
-
 ### Load training data
 
 Input:
@@ -170,11 +134,6 @@ TrainingData object (None)
 End
 ```
 
-With make:
-```commandline
-make load_traning_data name=2x2 filepath=analysis/tests/data/2x2.xlsx sheet_name=0
-```
-
 ## FAQ
 
 Empty yet
diff --git a/analysis/admin.py b/analysis/admin.py
@@ -7,4 +7,4 @@
 
 # Register your models here.
 admin.site.register(TextToken)
-admin.site.register(Token)
+admin.site.register(Token)
diff --git a/analysis/forms.py b/analysis/forms.py
@@ -2,7 +2,6 @@
 Forms
 """
 from django import forms
-from django_find_similar.forms import FindSimilarForm
 
 
 class OneTextForm(forms.Form):
@@ -26,13 +25,13 @@ class TwoTextForm(forms.Form):
     }))
 
 
-class LoadTrainingDataForm(forms.Form):
-    name = forms.CharField(max_length=128, widget=forms.TextInput(attrs={
-        'class': 'form-control'
-    }))
-    excel_file = forms.FileField(max_length=128, widget=forms.FileInput(attrs={
-        'class': 'form-control'
-    }))
-    sheet_name = forms.IntegerField(required=False, initial=0, widget=forms.NumberInput(attrs={
-        'class': 'form-control'
-    }))
+# class LoadTrainingDataForm(forms.Form):
+#     name = forms.CharField(max_length=128, widget=forms.TextInput(attrs={
+#         'class': 'form-control'
+#     }))
+#     excel_file = forms.FileField(max_length=128, widget=forms.FileInput(attrs={
+#         'class': 'form-control'
+#     }))
+#     sheet_name = forms.IntegerField(required=False, initial=0, widget=forms.NumberInput(attrs={
+#         'class': 'form-control'
+#     }))
diff --git a/analysis/functions.py b/analysis/functions.py
@@ -1,64 +1,17 @@
 """
 Analysis functions
 """
-from django.conf import settings
-from .loaders import load_from_excel
-from .models import TrainingData, to_list
+from find_similar.tokenize import tokenize  # pylint: disable=import-error
+from find_similar.calc_functions import calc_cosine_similarity_opt  # pylint: disable=import-error
+from utils.decorators import Printer
 
 
-class Printer:
-    """
-    This class decorator save results to some place (default print its)
-    """
-
-    def __init__(self, title=None, printer=print):
-        """
-        Init
-        :title: callback with title -> title()
-        :printer: print function (default print)
-        """
-        self.title = title
-        self.printer = printer
-
-    def __call__(self, func):
-        """
-        Make decorator
-        :func: decorated function
-        """
-        def inner(*args, **kwargs):
-            """
-            New function
-            """
-            printer = kwargs.get('printer', self.printer)
-
-            if 'printer' in kwargs:
-                is_delete_printer = True
-                if 'is_pass_printer' in kwargs:
-                    if kwargs['is_pass_printer']:
-                        is_delete_printer = False
-                    del kwargs['is_pass_printer']
-
-                if is_delete_printer:
-                    del kwargs['printer']
-
-            printer('Start')
-            if self.title is not None:
-                printer(self.title(*args, **kwargs))
-            result = func(*args, **kwargs)
-            printer('Done:')
-            printer(result)
-            printer('End')
-            return result
-
-        return inner
-
-
-@Printer(title=lambda item, **kwargs: f'Get tokens for {item}...')
-def analyze_one_item(item, dictionary=None, language="russian"):
+@Printer(title=lambda text, **kwargs: f'Get tokens for {text}...')
+def analyze_one_item(text, language="english", remove_stopwords=True):
     """
     Analyze one item for tokenize
     """
-    tokens = settings.TOKENIZE(item, language=language, dictionary=dictionary)
+    tokens = tokenize(text, language=language, remove_stopwords=remove_stopwords)
     return tokens
 
 
@@ -69,59 +22,5 @@ def analyze_two_items(one, two, printer=print):
     """
     one_tokens = analyze_one_item(one, printer=printer)  # pylint: disable=unexpected-keyword-arg
     two_tokens = analyze_one_item(two, printer=printer)  # pylint: disable=unexpected-keyword-arg
-    cos = settings.CALC_COSINE_SIMILARITY_OPT(one_tokens, two_tokens)
+    cos = calc_cosine_similarity_opt(one_tokens, two_tokens)
     return cos
-
-
-@Printer(title=lambda example, **kwargs: f'Analyze "{example}"...')
-def example_frequency_analysis(example):
-    """
-    Example Frequency analysis
-    :example: Example name
-    """
-    result = settings.FREQUENCY_ANALYSIS(example)
-    return result
-
-
-@Printer(title=lambda name, filepath, sheet_name=0, **kwargs: f'Loading data from "{filepath}"...')
-def load_training_data(name, filepath, sheet_name=0):
-    dataframe = load_from_excel(filepath, sheet_name)
-    # TrainingData
-    training_data = TrainingData.objects.create(name=name, data=dataframe.to_json())
-    return training_data
-
-
-@Printer(title=lambda text, dataframe, find_similar, **kwargs: f'Find similar for "{text}" in "{dataframe}"...')
-def find_similar_dataframe(text, dataframe, find_similar, **kwargs):
-    texts = to_list(dataframe)
-    return find_similar(text, texts, **kwargs)
-
-
-def total_rating(to_search, match_list, find_similar):
-    results = {}
-    all_list = []
-    for line in match_list:
-        all_list += line
-
-    for search in to_search:
-        similars = find_similar(search, all_list)
-        print('search', search, 'similars', similars)
-        for line in match_list:
-            if search in line:
-                print('line', line)
-                line_count = len(line)
-                print('SEARCH', search)
-                print('similars', similars)
-                similars = similars[:line_count]
-                print('short similars', similars)
-                similars = [item['name'] for item in similars]
-                find_count = 0
-                for item in line:
-                    if item in similars:
-                        find_count += 1
-                result = f'{find_count}/{line_count}'
-                print('result', result)
-                results[search] = result
-
-    return results
-
diff --git a/analysis/migrations/0004_delete_trainingdata.py b/analysis/migrations/0004_delete_trainingdata.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.6 on 2023-11-09 12:47
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('analysis', '0003_trainingdata_create_trainingdata_update'),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name='TrainingData',
+        ),
+    ]