Skip to content

Commit

Permalink
add tokenize feature
Browse files Browse the repository at this point in the history
  • Loading branch information
quillcraftsman committed Nov 5, 2023
1 parent 980eb51 commit b08e261
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 42 deletions.
10 changes: 1 addition & 9 deletions analysis/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Forms
"""
from django import forms
from django_find_similar.forms import FindSimilarForm


class OneTextForm(forms.Form):
Expand Down Expand Up @@ -35,12 +36,3 @@ class LoadTrainingDataForm(forms.Form):
sheet_name = forms.IntegerField(required=False, initial=0, widget=forms.NumberInput(attrs={
'class': 'form-control'
}))


class FindSimilarForm(forms.Form):
    """
    Form with a single ``text`` field (at most 128 characters),
    rendered as a text input with the ``form-control`` css class
    """
    text = forms.CharField(
        max_length=128,
        widget=forms.TextInput(attrs={'class': 'form-control'}),
    )
10 changes: 10 additions & 0 deletions analysis/templates/analysis/tokenize.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{% extends "base.html" %}
{# Tokenize page: extends base.html and overrides its "main" and "results" blocks. #}
{% block main %}
{# The form posts back to the current url (no action attribute); fields are rendered as <p> elements. #}
<form method="post">
{% csrf_token %}
{{form.as_p}}
<button type="submit" class="btn btn-primary">Tokenize</button>
</form>
{% endblock %}
{# The "results" block is overridden with empty content. #}
{% block results %}
{% endblock %}
2 changes: 1 addition & 1 deletion analysis/templates/analysis/training_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ <h2>{{object.name}}</h2>
<a class="btn btn-primary" href="#">Total rating</a>
<a class="btn btn-info" href="#">One column rating</a>
<a class="btn btn-info" href="{% url 'analysis:find_similar' pk=object.pk %}">Find similar</a>
<a class="btn btn-second" href="#">Tokenize</a>
<a class="btn btn-second" href="{% url 'analysis:tokenize' %}">Tokenize</a>
<a class="btn btn-danger" href="{% url 'analysis:delete_training_data' pk=object.pk %}">Delete</a>
{% endblock %}
{% block results %}
Expand Down
4 changes: 4 additions & 0 deletions analysis/tests/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def test_reverse(self):
'url': 'clear_text_token',
'reverse': 'clear-text-token/',
},
{
'url': 'tokenize',
'reverse': 'tokenize/',
},
]
for url in urls:
app_url = f'{app_name}:{url["url"]}'
Expand Down
24 changes: 0 additions & 24 deletions analysis/tests/tests_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
OneTextForm,
TwoTextForm,
LoadTrainingDataForm,
FindSimilarForm,
)


Expand Down Expand Up @@ -80,26 +79,3 @@ def test_fields(self):

current_form = LoadTrainingDataForm()
self.assertTrueForm(current_form, true_form)


class TestFindSimilarForm(SimpleTestCase):
    """
    Field checks for FindSimilarForm
    """

    def test_fields(self):
        """
        An unbound form must match the expected description:
        count=1 and a ``text`` entry typed as CharField
        """
        expected_fields = Fields(count=1, types={'text': forms.CharField})
        expected_form = TrueForm(fields=expected_fields)

        self.assertTrueForm(FindSimilarForm(), expected_form)
61 changes: 57 additions & 4 deletions analysis/tests/tests_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
from django.core.files.uploadedfile import SimpleUploadedFile
from django.urls import reverse
from django_find_similar.models import CheckResult, TextToken
from django_find_similar.models import CheckResult, TextToken, Token
from dry_tests import (
TestCase,
SimpleTestCase,
Expand All @@ -13,13 +13,12 @@
Context,
POST,
)
from django_find_similar.forms import FindSimilarForm
from django_find_similar.forms import FindSimilarForm, FindSimilarParamsForm
from mixer.backend.django import mixer
from analysis.forms import OneTextForm, TwoTextForm, LoadTrainingDataForm
from analysis.models import TrainingData
from analysis.tests.data import get_2x2_filepath, get_2x2_training_data
from analysis.urls import app_name
from analysis.functions import load_training_data


FORM_CONTENT_VALUES = [
Expand Down Expand Up @@ -691,4 +690,58 @@ def test_post(self):
self.assertTrueResponse(current_response, true_response)

# db state after
self.assertFalse(TextToken.objects.all().exists())
self.assertFalse(TextToken.objects.all().exists())


class TokenizeViewTestCase(TestCase):
    """
    Tokenize view test
    """

    def setUp(self):
        """
        Resolve the tokenize url before every test
        """
        self.url = reverse('analysis:tokenize')

    def test_get(self):
        """
        GET answers 200 with a FindSimilarParamsForm under ``form`` in the context
        """
        get_request = Request(url=self.url)
        expected = TrueResponse(
            status_code=200,
            context=Context(types={'form': FindSimilarParamsForm}),
            content_values=FORM_CONTENT_VALUES,
        )

        response = get_request.get_response(self.client)
        self.assertTrueResponse(response, expected)

    def test_post(self):
        """
        POST redirects to the training data list;
        TextToken and Token rows exist afterwards but not before
        """
        post_request = Request(
            url=self.url,
            method=POST,
            data={'language': 'english', 'remove_stopwords': True},
        )
        expected = TrueResponse(
            status_code=302,
            redirect_url='/analysis/training-data-list/',
        )

        # presumably stores a 2x2 training data set for the view to tokenize
        # (verify against analysis.tests.data)
        self.training_data = get_2x2_training_data()

        # db before
        for model in (TextToken, Token):
            self.assertFalse(model.objects.all().exists())

        response = post_request.get_response(self.client)
        self.assertTrueResponse(response, expected)

        # db after
        for model in (TextToken, Token):
            self.assertTrue(model.objects.all().exists())
1 change: 1 addition & 0 deletions analysis/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
path('text-token/<int:pk>/', views.TextTokenDetailView.as_view(), name="text_token"),
path('clear-training-data/', views.clear_training_data, name="clear_training_data"),
path('clear-text-token/', views.clear_text_token, name="clear_text_token"),
path('tokenize/', views.TokenizeView.as_view(), name="tokenize"),
]
37 changes: 34 additions & 3 deletions analysis/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from django.views.generic import FormView, DetailView, ListView, DeleteView
from django.urls import reverse, reverse_lazy
from django.conf import settings
from django_find_similar.forms import FindSimilarForm
from django_find_similar.forms import FindSimilarForm, FindSimilarParamsForm
from django_find_similar.models import TextToken, TokenTextAdapter, CheckResult
from find_similar import find_similar

Expand All @@ -20,7 +20,8 @@
)
from .forms import (
OneTextForm,
TwoTextForm, LoadTrainingDataForm
TwoTextForm,
LoadTrainingDataForm,
)
from .models import TrainingData, to_list

Expand Down Expand Up @@ -263,4 +264,34 @@ def clear_text_token(request):
if request.method == 'POST':
TextToken.objects.all().delete()
return HttpResponseRedirect(reverse('analysis:text_token_list'))
return render(request, 'analysis/clear_data.html', context={'model_name': 'Text Tokens'})
return render(request, 'analysis/clear_data.html', context={'model_name': 'Text Tokens'})


class TokenizeView(FormView):
    """
    Tokenize the texts of all training data with the submitted params
    """
    form_class = FindSimilarParamsForm
    template_name = 'analysis/tokenize.html'
    success_url = reverse_lazy('analysis:training_data_list')

    def form_valid(self, form):
        """
        Create a TextToken for every item of every training data set,
        build tokens for all text tokens, then continue with the
        default FormView handling
        """
        params = form.cleaned_data
        language = params['language']
        remove_stopwords = params['remove_stopwords']

        # Every training data set is processed for now
        # (in the future we should take just one)
        for training_data in TrainingData.objects.all():
            texts = to_list(training_data.get_dataframe())
            text_tokens = [
                TextToken(
                    text=text,
                    language=language,
                    remove_stopwords=remove_stopwords,
                )
                for text in texts
            ]
            # One insert per training data set; conflicting rows are ignored
            TextToken.objects.bulk_create(text_tokens, ignore_conflicts=True)

        # Runs over all stored text tokens, not only the ones created above
        for text_token in TextToken.objects.all():
            text_token.create_tokens()

        return super().form_valid(form)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Django==4.2.6
django-dry-tests==1.0.0
django-find-similar==1.1.0
django-find-similar==1.2.0
pandas==2.1.1
openpyxl==3.1.2
coverage==7.3.2
Expand Down

0 comments on commit b08e261

Please sign in to comment.