-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
654 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
64 changes: 64 additions & 0 deletions
64
django/curator/management/commands/curator_cluster_tags.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import logging | ||
|
||
from django.core.management.base import BaseCommand | ||
|
||
from curator.tag_deduplication import TagClusterer, TagClusterManager | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that clusters Tag rows with dedupe via ``TagClusterer``."""

    help = """
    Cluster Tags using dedupe. This command takes the rows of tags available in the database and clusters the tags together using Dedupe.
    It takes in the rows available in the Tag table and attempts to create CanonicalTag objects that are stored in the database.
    """

    def add_arguments(self, parser):
        """Register the ``--label``, ``--reset`` and ``--threshold`` options on ``parser``."""
        parser.add_argument(
            "--label",
            "-l",
            help="label the training data for the clustering model using the console.",
            action="store_true",
            default=False,
        )

        parser.add_argument(
            "--reset",
            "-r",
            help="""remove all unlabelled clusters from the database.""",
            action="store_true",
            default=False,
        )

        parser.add_argument(
            "--threshold",
            "-t",
            help="""float between [0,1]. Blank defaults to 0.5.
            Defines how much confidence to require from the model before tags are clustered.
            Higher thresholds cluster less tags and require more training data labels.""",
            # Without an explicit type, a value given on the command line arrives as a str.
            type=float,
            default=0.5,
        )

    def handle(self, *args, **options):
        """
        Cluster the tags and save the resulting clusters.

        `curator_cluster_tags` should be used only if the curator would like for a large amount of unlabelled tags to be clustered.
        For individual tags, the TagGazetteer is more preferred.

        The command refuses to run while unlabelled clusters exist unless
        ``--reset`` is given.
        """
        if TagClusterManager.has_unlabelled_clusters() and not options["reset"]:
            logging.warning(
                "There are still some unlabelled clusters. Finish labelling those using curator_edit_clusters or run this command with the --reset option to remove all unlabelled clusters."
            )
            return

        TagClusterManager.reset()
        # Coerce here too: option values supplied programmatically (e.g. as
        # call_command keyword arguments) may not pass through argparse's type=.
        tag_clusterer = TagClusterer(clustering_threshold=float(options["threshold"]))

        if options["label"]:
            tag_clusterer.console_label()
            tag_clusterer.save_to_training_file()

        if not tag_clusterer.training_file_exists():
            # NOTE(review): the message asks the user to "try again", yet clustering
            # still proceeds below. Original behaviour kept; confirm whether this
            # branch should return instead.
            logging.warning(
                "Your model does not have any labelled data. Run this command with --label and try again."
            )

        clusters = tag_clusterer.cluster_tags()
        tag_clusterer.save_clusters(clusters)
16 changes: 16 additions & 0 deletions
16
django/curator/management/commands/curator_edit_clusters.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import logging | ||
|
||
from django.core.management.base import BaseCommand | ||
|
||
from curator.tag_deduplication import TagClusterer, TagClusterManager | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that delegates to ``TagClusterManager.console_label``."""

    # TODO: Expand upon this
    help = """
    Edit clusters.
    """

    def handle(self, *args, **options):
        """Run ``TagClusterManager.console_label()``; ``args`` and ``options`` are not used."""
        TagClusterManager.console_label()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import logging | ||
|
||
from django.core.management.base import BaseCommand | ||
from taggit.models import Tag | ||
|
||
from curator.tag_deduplication import TagGazetteer | ||
from curator.models import CanonicalTag, CanonicalTagMapping | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that maps unmapped Tags onto CanonicalTags via ``TagGazetteer``."""

    help = (
        "Matches tags to a canonical list of tags using dedupe. This command finds the "
        "canonical tag that most closely matches each tag that has no canonical mapping yet."
    )

    def add_arguments(self, parser):
        """Register the ``--label`` and ``--threshold`` options on ``parser``."""
        parser.add_argument(
            "--label",
            "-l",
            help="label the training data for the gazetteering model using the console",
            action="store_true",
            default=False,
        )
        parser.add_argument(
            "--threshold",
            "-t",
            help="""float between [0,1]. Blank defaults to 0.5.
            Defines how much confidence to require from the model before tags are selected from the canonical list.
            Higher thresholds matches less tags to those in a canonical list and require more training data labels.""",
            # Lets argparse reject non-numeric values up front.
            type=float,
            default=0.5,
        )

    def handle(self, *args, **options):
        """
        `curator_gazetteer_tags` searches for CanonicalTags that most closely match a certain tag.
        From a canonical list, the canonical tag that most closely matches is selected.

        Each proposed mapping is shown on the console and saved only when the
        user confirms it. A warning is logged at the end if any tag was left
        unmapped, either because the search returned nothing or because the
        proposal was rejected.
        """
        if not CanonicalTag.objects.exists():
            # Nothing is populated here; the user has to run the other command.
            logger.warning(
                "Canonical list is empty, populate the canonical list using the curator_cluster_tags command"
            )
            return

        # Kept alongside type=float: option values supplied programmatically
        # (e.g. as call_command keyword arguments) may not pass through argparse.
        tag_gazetteer = TagGazetteer(float(options["threshold"]))

        if options["label"]:
            tag_gazetteer.console_label()
            tag_gazetteer.save_to_training_file()

        if not tag_gazetteer.training_file_exists():
            # NOTE(review): the message asks the user to "try again", yet matching
            # still proceeds below. Original behaviour kept; confirm whether this
            # branch should return instead.
            logger.warning(
                "Your model does not have any labelled data. Run this command with --label and try again."
            )

        tags = Tag.objects.filter(canonicaltagmapping=None)
        is_unmatched = False
        for tag in tags:
            matches = tag_gazetteer.text_search(tag.name)
            if not matches:
                # The search returned nothing for this tag, so it stays unmapped
                # and must count towards the final warning.
                is_unmatched = True
                continue

            best_match = matches[0]
            canonical_tag_mapping = CanonicalTagMapping(
                tag=tag, canonical_tag=best_match[0], confidence_score=best_match[1]
            )

            is_correct = input(
                f"Does the following mapping make sense?:\n{str(canonical_tag_mapping)}\n(y)es/(n)o\n"
            )

            # The prompt advertises "(y)es", so accept both spellings in any case.
            if is_correct.strip().lower() in ("y", "yes"):
                print("Mapped tag!")
                canonical_tag_mapping.save()
            else:
                is_unmatched = True

        if is_unmatched:
            logger.warning(
                "There are some Tags that could not be matched to CanonicalTags. Either lower the threshold or increase the training data size."
            )
16 changes: 16 additions & 0 deletions
16
django/curator/management/commands/curator_modify_canon.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import logging | ||
|
||
from django.core.management.base import BaseCommand | ||
from taggit.models import Tag | ||
|
||
from curator.tag_deduplication import TagGazetteer, TagClusterManager | ||
from curator.models import CanonicalTag, CanonicalTagMapping | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that delegates to ``TagClusterManager.console_canonicalize_edit``."""

    # The previous help text was a truncated copy of the curator_gazetteer_tags
    # help and did not describe this command.
    # NOTE(review): wording inferred from the command and method names; confirm
    # against TagClusterManager.console_canonicalize_edit.
    help = "Modify the canonical tag list from the console using TagClusterManager.console_canonicalize_edit()."

    def handle(self, *args, **options):
        """Run ``TagClusterManager.console_canonicalize_edit()``; ``args`` and ``options`` are not used."""
        TagClusterManager.console_canonicalize_edit()
44 changes: 44 additions & 0 deletions
44
django/curator/migrations/0003_canonicaltag_canonicaltagmapping_tagcluster.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Generated by Django 3.2.21 on 2023-10-05 18:30 | ||
|
||
from django.conf import settings | ||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the CanonicalTag, TagCluster and CanonicalTagMapping models."""

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("taggit", "0005_auto_20220424_2025"),
        ("curator", "0002_rename_tagcleanup_permission"),
    ]

    operations = [
        # Canonical tag names; each name is unique.
        migrations.CreateModel(
            name="CanonicalTag",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.TextField(unique=True)),
            ],
        ),
        # A group of taggit tags with a canonical name and a confidence score.
        migrations.CreateModel(
            name="TagCluster",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("canonical_tag_name", models.TextField()),
                ("confidence_score", models.FloatField()),
                ("date_created", models.DateTimeField(auto_now_add=True, null=True)),
                ("tags", models.ManyToManyField(to="taggit.Tag")),
            ],
        ),
        # One row per taggit tag (the tag is the primary key), pointing at a
        # CanonicalTag and optionally at the user recorded as curator.
        migrations.CreateModel(
            name="CanonicalTagMapping",
            fields=[
                (
                    "tag",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        primary_key=True,
                        serialize=False,
                        to="taggit.tag",
                    ),
                ),
                ("confidence_score", models.FloatField()),
                ("date_created", models.DateTimeField(auto_now=True)),
                (
                    "canonical_tag",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="curator.canonicaltag",
                    ),
                ),
                (
                    "curator",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
    ]
Oops, something went wrong.