Skip to content

Commit

Permalink
add some optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
quillcraftsman committed Nov 6, 2023
1 parent b08e261 commit 56526b2
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 28 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ celerybeat.pid
.venv
env/
venv/
venv10/
ENV/
env.bak/
venv.bak/
Expand Down Expand Up @@ -157,4 +158,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.idea/

# uploads folder
uploads/loaddata.xlsx
10 changes: 8 additions & 2 deletions analysis/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
from django.db import models
from django.utils.functional import cached_property


class TrainingData(models.Model):
Expand All @@ -13,16 +14,21 @@ class TrainingData(models.Model):
create = models.DateTimeField(auto_now_add=True)
update = models.DateTimeField(auto_now=True)

@cached_property
def get_dataframe(self) -> pd.DataFrame:
    """
    Parse the JSON text stored in ``self.data`` into a DataFrame.

    All columns are read with ``dtype=str``. Because this is a
    ``cached_property`` it is accessed as an attribute
    (``obj.get_dataframe``) and evaluated once per instance.
    """
    json_buffer = StringIO(self.data)
    return pd.read_json(json_buffer, dtype=str)

@property
def columns_count(self):
return len(self.get_dataframe().columns)
return len(self.get_dataframe.columns)

@property
def rows_count(self):
return len(self.get_dataframe().index)
return len(self.get_dataframe.index)

def display_dataframe(self, limit: int = 10) -> pd.DataFrame:
    """
    Return the leading rows of the dataset for on-page preview.

    :param limit: number of rows to keep from the top. Defaults to 10,
        the value that used to be hard-coded, so callers that pass no
        argument (such as templates) see the same result as before.
    :return: ``self.get_dataframe.head(limit)``.
    """
    return self.get_dataframe.head(limit)


def to_list(dataframe: pd.DataFrame) -> list:
Expand Down
2 changes: 1 addition & 1 deletion analysis/templates/analysis/find_similar.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ <h2>{{object.name}}</h2>
</form>
{% endblock %}
{% block results %}
{% with object.get_dataframe as data %}
{% with object.display_dataframe as data %}
<table class="table">
<tr>
{% for column in data.columns %}
Expand Down
2 changes: 1 addition & 1 deletion analysis/templates/analysis/training_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ <h2>{{object.name}}</h2>
<a class="btn btn-danger" href="{% url 'analysis:delete_training_data' pk=object.pk %}">Delete</a>
{% endblock %}
{% block results %}
{% with object.get_dataframe as data %}
{% with object.display_dataframe as data %}
<table class="table">
<tr>
{% for column in data.columns %}
Expand Down
6 changes: 3 additions & 3 deletions analysis/tests/tests_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def test_load_testing_data(self):
expected = get_2x2_expected_data()
result = load_training_data('first', filepath, sheet_name=0, printer=self.testing_printer)
self.assertTrue(isinstance(result, TrainingData))
self.assertTrue(expected.equals(result.get_dataframe()))
self.assertTrue(expected.equals(result.get_dataframe))

# prints
expected_prints = [
Expand All @@ -276,10 +276,10 @@ def find_similar_2x2(text, texts):

training_data = get_2x2_training_data()
text = '2'
dataframe = training_data.get_dataframe()
dataframe = training_data.get_dataframe
similars = find_similar_dataframe(
text,
training_data.get_dataframe(),
training_data.get_dataframe,
find_similar_2x2,
printer=self.testing_printer
)
Expand Down
6 changes: 3 additions & 3 deletions analysis/tests/tests_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def test_save(self):
self.assertTrue(self.dataframe.equals(get_data))

def test_data_from_json(self):
self.assertTrue(self.dataframe.equals(self.training_data.get_dataframe()))
self.assertTrue(self.dataframe.equals(self.training_data.get_dataframe))

def test_count(self):
self.assertEqual(len(self.training_data.get_dataframe().columns), 2)
self.assertEqual(len(self.training_data.get_dataframe().index), 2)
self.assertEqual(len(self.training_data.get_dataframe.columns), 2)
self.assertEqual(len(self.training_data.get_dataframe.index), 2)
self.assertEqual(self.training_data.columns_count, 2)
self.assertEqual(self.training_data.rows_count, 2)

Expand Down
2 changes: 1 addition & 1 deletion analysis/tests/tests_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def test_get(self):
self.training_data.name,
]

dataframe = self.training_data.get_dataframe()
dataframe = self.training_data.get_dataframe

# add headers
columns = dataframe.columns
Expand Down
58 changes: 44 additions & 14 deletions analysis/views.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Analysis views
"""
import cProfile
import os

from django.http import HttpResponseRedirect
Expand All @@ -9,8 +10,9 @@
from django.urls import reverse, reverse_lazy
from django.conf import settings
from django_find_similar.forms import FindSimilarForm, FindSimilarParamsForm
from django_find_similar.models import TextToken, TokenTextAdapter, CheckResult
from django_find_similar.models import TextToken, TokenTextAdapter, CheckResult, Token, CheckResultItem
from find_similar import find_similar
from find_similar.tokenize import tokenize

from analysis.functions import (
analyze_one_item,
Expand Down Expand Up @@ -149,7 +151,6 @@ def form_valid(self, form):
uploaded_path = self.handle_uploaded_file(excel_file)
name = data['name']
sheet_name = data.get('sheet_name', 0)
print('SHEET_NAME', sheet_name)
self.training_data = load_training_data(name=name, filepath=uploaded_path, sheet_name=sheet_name)
return super().form_valid(form)

Expand Down Expand Up @@ -208,7 +209,7 @@ def form_valid(self, form):

# save all data from dataset to TextToken
# self.object
data_list = to_list(self.object.get_dataframe())
data_list = to_list(self.object.get_dataframe)

new_token_texts = []
for item in data_list:
Expand All @@ -227,7 +228,7 @@ def form_valid(self, form):
result = find_similar(adapter, adapters, count=len(data_list))

# save results to the database
CheckResult.save_result(text_token, result)
# CheckResult.save_result(text_token, result)
return super().form_valid(form)


Expand All @@ -246,6 +247,10 @@ class TextTokenListView(ListView):
model = TextToken
template_name = 'analysis/text_token_list.html'
ordering = ['-create']
paginate_by = 3000

def get_queryset(self):
    """
    Return the view's queryset with each text's tokens prefetched.

    Building on ``super().get_queryset()`` keeps the class-level
    ``ordering`` in effect. A queryset created directly from
    ``TextToken.objects`` bypasses it, and paginating an unordered
    queryset can produce inconsistent pages.
    """
    return super().get_queryset().prefetch_related('token_set')


class TextTokenDetailView(DetailView):
Expand All @@ -256,12 +261,19 @@ class TextTokenDetailView(DetailView):
def clear_training_data(request):
    """
    Confirm and perform removal of all training data and derived records.

    Any non-POST request renders the confirmation page. A POST deletes
    every TrainingData, CheckResultItem, Token, CheckResult and TextToken
    row, in that order, then redirects to the training data list.
    """
    if request.method != 'POST':
        return render(request, 'analysis/clear_data.html', context={'model_name': 'Training Data'})

    # Delete order is preserved from the original statement sequence.
    for model in (TrainingData, CheckResultItem, Token, CheckResult, TextToken):
        model.objects.all().delete()
    return HttpResponseRedirect(reverse('analysis:training_data_list'))


def clear_text_token(request):
    """
    Confirm and perform removal of all text tokens and derived records.

    Any non-POST request renders the confirmation page. A POST deletes
    every CheckResultItem, Token, CheckResult and TextToken row, in that
    order, then redirects to the text token list.
    """
    if request.method != 'POST':
        return render(request, 'analysis/clear_data.html', context={'model_name': 'Text Tokens'})

    # Delete order is preserved from the original statement sequence.
    for model in (CheckResultItem, Token, CheckResult, TextToken):
        model.objects.all().delete()
    return HttpResponseRedirect(reverse('analysis:text_token_list'))
Expand All @@ -273,25 +285,43 @@ class TokenizeView(FormView):
success_url = reverse_lazy('analysis:training_data_list')

def form_valid(self, form):
# profiler = cProfile.Profile()
# profiler.enable()
cleaned_data = form.cleaned_data
language = cleaned_data['language']
remove_stopwords = cleaned_data['remove_stopwords']
# Process all training data (in the future we should get just one)
training_data_list = TrainingData.objects.all()
all_token_texts = []
for training_data in training_data_list:
data_list = to_list(training_data.get_dataframe())

new_token_texts = []
data_list = to_list(training_data.get_dataframe)

for item in data_list:
item_text_token = TextToken(
all_token_texts.append(TextToken(
text=item,
language=language,
remove_stopwords=remove_stopwords
)
new_token_texts.append(item_text_token)
TextToken.objects.bulk_create(new_token_texts, ignore_conflicts=True)
))

for text_token in TextToken.objects.all():
text_token.create_tokens()

TextToken.objects.bulk_create(all_token_texts, ignore_conflicts=True)

all_token_texts = TextToken.objects.all()

all_tokens = []
# for text_token in TextToken.objects.all():
for text_token in all_token_texts:
# text_token.create_tokens()
token_set = tokenize(
text_token.text,
language=text_token.language,
remove_stopwords=text_token.remove_stopwords
)
# tokens = map(lambda text_str: Token(value=text_str, token_text=text_token), token_set)
# tokens = [Token(value=text_str, token_text=text_token) for text_str in token_set]
# all_tokens += tokens
for text_str in token_set:
all_tokens.append(Token(value=text_str, token_text=text_token))

Token.objects.bulk_create(all_tokens, ignore_conflicts=True)
# profiler.disable()
return super().form_valid(form)
7 changes: 7 additions & 0 deletions core/tests/test_numpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from django.test import SimpleTestCase


class NumpyTestCase(SimpleTestCase):
    """Placeholder test case; it currently holds one always-passing check."""

    def test_matrix_updates(self):
        """Pass unconditionally; no matrix behaviour is exercised yet."""
        # NOTE(review): presumably a stub for future numpy tests — confirm.
        self.assertTrue(True)
8 changes: 8 additions & 0 deletions laboratory/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"django.contrib.staticfiles",
# others
'django_find_similar',
"debug_toolbar",
# My
'core',
'analysis',
Expand All @@ -62,6 +63,7 @@
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"debug_toolbar.middleware.DebugToolbarMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"django.middleware.clickjacking.XFrameOptionsMiddleware",
Expand Down Expand Up @@ -140,3 +142,9 @@
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

INTERNAL_IPS = [
# ...
"127.0.0.1",
# ...
]
1 change: 1 addition & 0 deletions laboratory/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
path("admin/", admin.site.urls),
path('', include('core.urls')),
path('analysis/', include('analysis.urls')),
path("__debug__/", include("debug_toolbar.urls")),
]
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
Django==4.2.6
django-dry-tests==1.0.0
django-find-similar==1.2.0
django-find-similar==1.3.0
pandas==2.1.1
openpyxl==3.1.2
coverage==7.3.2
mixer==7.2.2
mixer==7.2.2
django-debug-toolbar==4.2.0
Empty file added uploads/.empty
Empty file.

0 comments on commit 56526b2

Please sign in to comment.