Skip to content

Commit

Permalink
feat: tag clustering using ML
Browse files Browse the repository at this point in the history
  • Loading branch information
hwelsters committed Oct 5, 2023
1 parent c3bf4d4 commit d4a0b48
Show file tree
Hide file tree
Showing 10 changed files with 654 additions and 20 deletions.
16 changes: 8 additions & 8 deletions django/curator/management/commands/curator_clean_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def add_arguments(self, parser):

def handle_load(self, restore_directory):
path = restore_directory.joinpath(PENDING_TAG_CLEANUPS_FILENAME)
print("Loading data from path {}".format(str(path)))
logger.debug("Loading data from path %s", path)
tag_cleanups = TagCleanup.load(path)
TagCleanup.objects.bulk_create(tag_cleanups)

Expand All @@ -50,11 +50,11 @@ def handle_run(self):
def handle_view(self):
qs = TagCleanup.objects.filter(is_active=True)
if qs.count() > 0:
print("Tag Cleanups\n--------------------\n")
logger.debug("Tag Cleanups\n--------------------\n")
for tag_cleanup in qs.iterator():
print(tag_cleanup)
logger.debug(tag_cleanup)
else:
print("No Pending Tag Cleanups!")
logger.debug("No Pending Tag Cleanups!")

def handle(self, *args, **options):
run = options["run"]
Expand All @@ -70,11 +70,11 @@ def handle(self, *args, **options):
elif method:
self.handle_method(method)
elif dump:
print(
"Dumping tag curation data to {}".format(
load_directory.joinpath(PENDING_TAG_CLEANUPS_FILENAME)
)
logger.debug(
"Dumping tag curation data to %s",
load_directory.joinpath(PENDING_TAG_CLEANUPS_FILENAME),
)

TagCleanup.objects.dump(
load_directory.joinpath(PENDING_TAG_CLEANUPS_FILENAME)
)
Expand Down
64 changes: 64 additions & 0 deletions django/curator/management/commands/curator_cluster_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import logging

from django.core.management.base import BaseCommand

from curator.tag_deduplication import TagClusterer, TagClusterManager


class Command(BaseCommand):
    """Management command that clusters tags via ``TagClusterer``.

    Options:
        --label / -l      label training data on the console before clustering.
        --reset / -r      proceed even if unlabelled clusters exist; they are
                          removed by ``TagClusterManager.reset()``.
        --threshold / -t  float handed to ``TagClusterer`` as
                          ``clustering_threshold`` (default 0.5).
    """

    help = (
        "\n"
        "Cluster Tags using dedupe. This command takes the rows of tags available in the database and clusters the tags together using Dedupe.\n"
        "It takes in the rows available in the Tag table and attempts to create CanonicalTag objects that are stored in the database.\n"
    )

    def add_arguments(self, parser):
        """Register the ``--label``, ``--reset`` and ``--threshold`` options."""
        parser.add_argument(
            "--label",
            "-l",
            help="label the training data for the clustering model using the console.",
            action="store_true",
            default=False,
        )

        parser.add_argument(
            "--reset",
            "-r",
            help="remove all unlabelled clusters from the database.",
            action="store_true",
            default=False,
        )

        parser.add_argument(
            "--threshold",
            "-t",
            help=(
                "float between [0,1]. Blank defaults to 0.5.\n"
                "Defines how much confidence to require from the model before tags are clustered.\n"
                "Higher thresholds cluster less tags and require more training data labels."
            ),
            # Without an explicit type argparse would hand over the raw string
            # whenever the option is given on the command line.
            type=float,
            default=0.5,
        )

    def handle(self, *args, **options):
        """
        `curator_cluster_tags` should be used only if the curator would like for a large amount of unlabelled tags to be clustered.
        For individual tags, the TagGazetteer is more preferred.

        Returns early (after logging a warning) when unlabelled clusters are
        still pending and ``--reset`` was not given, or when no training file
        exists once the optional labelling step has run.
        """
        logger = logging.getLogger(__name__)

        if TagClusterManager.has_unlabelled_clusters() and not options["reset"]:
            logger.warning(
                "There are still some unlabelled clusters. Finish labelling those using curator_edit_clusters or run this command with the --reset option to remove all unlabelled clusters."
            )
            return

        TagClusterManager.reset()
        # float() also covers programmatic callers that bypass argparse's
        # type conversion and pass the threshold as a string.
        tag_clusterer = TagClusterer(
            clustering_threshold=float(options["threshold"])
        )

        if options["label"]:
            tag_clusterer.console_label()
            tag_clusterer.save_to_training_file()

        if not tag_clusterer.training_file_exists():
            logger.warning(
                "Your model does not have any labelled data. Run this command with --label and try again."
            )
            # The message asks the user to re-run with --label, so stop here
            # instead of attempting to cluster without labelled data.
            return

        clusters = tag_clusterer.cluster_tags()
        tag_clusterer.save_clusters(clusters)
16 changes: 16 additions & 0 deletions django/curator/management/commands/curator_edit_clusters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import logging

from django.core.management.base import BaseCommand

from curator.tag_deduplication import TagClusterer, TagClusterManager


class Command(BaseCommand):
    """Management command that hands control to ``TagClusterManager.console_label``."""

    # TODO: Expand upon this
    help = "\nEdit clusters.\n"

    def handle(self, *args, **options):
        """Run ``TagClusterManager.console_label()``.

        Positional arguments and options are accepted but not used.
        NOTE(review): presumably this starts an interactive console session
        for labelling clusters -- confirm against ``TagClusterManager``.
        """
        TagClusterManager.console_label()
77 changes: 77 additions & 0 deletions django/curator/management/commands/curator_map_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging

from django.core.management.base import BaseCommand
from taggit.models import Tag

from curator.tag_deduplication import TagGazetteer
from curator.models import CanonicalTag, CanonicalTagMapping

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that maps unmapped tags onto canonical tags.

    For every ``Tag`` without a ``CanonicalTagMapping`` the command asks
    ``TagGazetteer.text_search`` for candidates, proposes the first candidate
    on the console, and saves the mapping only when the user confirms it.

    Options:
        --label / -l      label gazetteer training data on the console first.
        --threshold / -t  float passed to ``TagGazetteer`` (default 0.5).
    """

    help = "Matches tags to a canonical list of tags using dedupe. This command finds a canonical tag "

    def add_arguments(self, parser):
        """Register the ``--label`` and ``--threshold`` options."""
        parser.add_argument(
            "--label",
            "-l",
            help="label the training data for the gazetteering model using the console",
            action="store_true",
            default=False,
        )
        parser.add_argument(
            "--threshold",
            "-t",
            help=(
                "float between [0,1]. Blank defaults to 0.5.\n"
                "Defines how much confidence to require from the model before tags are selected from the canonical list.\n"
                "Higher thresholds matches less tags to those in a canonical list and require more training data labels."
            ),
            # Let argparse reject non-numeric input with a usage error
            # instead of a ValueError traceback from float() below.
            type=float,
            default=0.5,
        )

    def handle(self, *args, **options):
        """
        `curator_map_tags` searches for CanonicalTags that most closely match a certain tag.
        From a canonical list, the canonical tag that most closely matches is selected.

        Returns early (after logging a warning) when there are no
        ``CanonicalTag`` rows, or when no training file exists once the
        optional labelling step has run.
        """
        if not CanonicalTag.objects.exists():
            # Nothing is populated here; the user has to run the clustering
            # command first, so the message says so explicitly.
            logger.warning(
                "Canonical list is empty. Populate the canonical list using the curator_cluster_tags command first."
            )
            return

        # float() is kept for programmatic callers that pass a string.
        tag_gazetteer = TagGazetteer(float(options["threshold"]))

        if options["label"]:
            tag_gazetteer.console_label()
            tag_gazetteer.save_to_training_file()

        if not tag_gazetteer.training_file_exists():
            logger.warning(
                "Your model does not have any labelled data. Run this command with --label and try again."
            )
            # The message asks the user to re-run with --label, so stop here
            # instead of searching without labelled data.
            return

        unmapped_tags = Tag.objects.filter(canonicaltagmapping=None)
        is_unmatched = False
        for tag in unmapped_tags:
            matches = tag_gazetteer.text_search(tag.name)
            if not matches:
                is_unmatched = True
                continue

            # Each match is indexed as (canonical tag, confidence score); the
            # first entry is the one proposed to the user.
            # NOTE(review): presumably matches are ordered best-first -- confirm.
            best_match = matches[0]
            canonical_tag_mapping = CanonicalTagMapping(
                tag=tag,
                canonical_tag=best_match[0],
                confidence_score=best_match[1],
            )

            answer = input(
                f"Does the following mapping make sense?:\n{canonical_tag_mapping}\n(y)es/(n)o\n"
            )

            # The prompt offers "(y)es", so accept "y"/"yes" in any case and
            # ignore surrounding whitespace.
            if answer.strip().lower() in ("y", "yes"):
                canonical_tag_mapping.save()
                # Announce only after the row has actually been written.
                self.stdout.write("Mapped tag!")

        if is_unmatched:
            logger.warning(
                "There are some Tags that could not be matched to CanonicalTags. Either lower the threshold or increase the training data size."
            )
16 changes: 16 additions & 0 deletions django/curator/management/commands/curator_modify_canon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import logging

from django.core.management.base import BaseCommand
from taggit.models import Tag

from curator.tag_deduplication import TagGazetteer, TagClusterManager
from curator.models import CanonicalTag, CanonicalTagMapping

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that calls ``TagClusterManager.console_canonicalize_edit``."""

    # NOTE(review): this help text is identical to the one in curator_map_tags
    # and looks copy-pasted; it probably should describe this command instead.
    help = "Matches tags to a canonical list of tags using dedupe. This command finds a canonical tag "

    def handle(self, *args, **options):
        """Run ``TagClusterManager.console_canonicalize_edit()``.

        Positional arguments and options are accepted but not used.
        """
        TagClusterManager.console_canonicalize_edit()
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 3.2.21 on 2023-10-05 18:30

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Create the ``CanonicalTag``, ``TagCluster`` and ``CanonicalTagMapping`` models."""

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("taggit", "0005_auto_20220424_2025"),
        ("curator", "0002_rename_tagcleanup_permission"),
    ]

    operations = [
        # Canonical tag: a unique free-text name.
        migrations.CreateModel(
            name="CanonicalTag",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.TextField(unique=True)),
            ],
        ),
        # Cluster: a set of taggit tags plus a canonical name and a score.
        migrations.CreateModel(
            name="TagCluster",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("canonical_tag_name", models.TextField()),
                ("confidence_score", models.FloatField()),
                (
                    "date_created",
                    models.DateTimeField(auto_now_add=True, null=True),
                ),
                ("tags", models.ManyToManyField(to="taggit.Tag")),
            ],
        ),
        # Mapping: keyed one-to-one on the taggit tag; canonical tag and
        # curator are nulled out (not cascaded) when their targets go away.
        migrations.CreateModel(
            name="CanonicalTagMapping",
            fields=[
                (
                    "tag",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        primary_key=True,
                        serialize=False,
                        to="taggit.tag",
                    ),
                ),
                ("confidence_score", models.FloatField()),
                # NOTE(review): auto_now (not auto_now_add) means this value is
                # refreshed on every save despite the "created" name -- confirm
                # that is intended.
                ("date_created", models.DateTimeField(auto_now=True)),
                (
                    "canonical_tag",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to="curator.canonicaltag",
                    ),
                ),
                (
                    "curator",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
    ]
Loading

0 comments on commit d4a0b48

Please sign in to comment.