From 084ec21a0292584f42f79930b6d6fe3bcf0ef08b Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Mon, 15 May 2023 23:43:19 -0400 Subject: [PATCH] feat: adjust models to properly support MySQL This codebase was originally developed and tested using only SQLite. In this commit, we're adding proper support for MySQL. In particular: * The size of ``key`` fields and titles were reduced from 1000 to 500. This is to accommodate MySQL's index size limit (3072 bytes which translates to 768 unicode code points in 4-byte encoding). Saving a little headroom for future compound indexes. * fields.py now has helper classes to support multiple collations so that we can specify utf8mb4 for the charset in MySQL. * fields.py now has helper functions that allow us to normalize case sensitivity across databases. By default, fields would otherwise be case sensitive in SQLite and case insensitive in MySQL. This is important for correctness, since ``key`` fields are meant to be be case sensitive for the purposes of uniqueness constraints. --- .../components/migrations/0001_initial.py | 149 ++----- openedx_learning/core/components/models.py | 10 +- .../core/contents/migrations/0001_initial.py | 112 ++--- openedx_learning/core/contents/models.py | 34 +- .../publishing/migrations/0001_initial.py | 383 ++++-------------- .../0002_add_publish_log_constraints.py | 24 -- openedx_learning/core/publishing/models.py | 9 +- openedx_learning/lib/collations.py | 116 ++++++ openedx_learning/lib/fields.py | 137 +++++-- setup.py | 10 +- 10 files changed, 425 insertions(+), 559 deletions(-) delete mode 100644 openedx_learning/core/publishing/migrations/0002_add_publish_log_constraints.py create mode 100644 openedx_learning/lib/collations.py diff --git a/openedx_learning/core/components/migrations/0001_initial.py b/openedx_learning/core/components/migrations/0001_initial.py index ac379c67..1d370747 100644 --- a/openedx_learning/core/components/migrations/0001_initial.py +++ b/openedx_learning/core/components/migrations/0001_initial.py @@ -1,153 +1,80 @@ -# Generated by Django 3.2.18 on 2023-05-11 02:07 +# Generated by Django 3.2.19 on 2023-06-15 14:43 from django.db import migrations, models import django.db.models.deletion +import openedx_learning.lib.fields import uuid class Migration(migrations.Migration): + initial = True dependencies = [ - ("oel_publishing", "0001_initial"), - ("oel_contents", "0001_initial"), + ('oel_publishing', '0001_initial'), + ('oel_contents', '0001_initial'), ] operations = [ migrations.CreateModel( - name="Component", + name='Component', fields=[ - ( - "publishable_entity", - models.OneToOneField( - on_delete=django.db.models.deletion.CASCADE, - primary_key=True, - serialize=False, - to="oel_publishing.publishableentity", - ), - ), - ("namespace", models.CharField(max_length=100)), - ("type", models.CharField(blank=True, max_length=100)), - ("local_key", models.CharField(max_length=255)), - ( - "learning_package", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_publishing.learningpackage", - ), - ), + ('publishable_entity', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, primary_key=True, serialize=False, to='oel_publishing.publishableentity')), + ('namespace', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=100)), + ('type', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=100)), + ('local_key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), + ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), ], options={ - "verbose_name": "Component", - "verbose_name_plural": "Components", + 'verbose_name': 'Component', + 'verbose_name_plural': 'Components', }, ), migrations.CreateModel( - name="ComponentVersion", + name='ComponentVersion', fields=[ - ( - "publishable_entity_version", - models.OneToOneField( - on_delete=django.db.models.deletion.CASCADE, - primary_key=True, - serialize=False, - to="oel_publishing.publishableentityversion", - ), - ), - ( - "component", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - related_name="versions", - to="oel_components.component", - ), - ), + ('publishable_entity_version', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, primary_key=True, serialize=False, to='oel_publishing.publishableentityversion')), + ('component', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='versions', to='oel_components.component')), ], options={ - "verbose_name": "Component Version", - "verbose_name_plural": "Component Versions", + 'verbose_name': 'Component Version', + 'verbose_name_plural': 'Component Versions', }, ), migrations.CreateModel( - name="ComponentVersionRawContent", + name='ComponentVersionRawContent', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "uuid", - models.UUIDField( - default=uuid.uuid4, - editable=False, - unique=True, - verbose_name="UUID", - ), - ), - ("key", models.CharField(max_length=255)), - ("learner_downloadable", models.BooleanField(default=False)), - ( - "component_version", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_components.componentversion", - ), - ), - ( - "raw_content", - models.ForeignKey( - on_delete=django.db.models.deletion.RESTRICT, - to="oel_contents.rawcontent", - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), + ('key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), + ('learner_downloadable', models.BooleanField(default=False)), + ('component_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_components.componentversion')), + ('raw_content', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='oel_contents.rawcontent')), ], ), migrations.AddField( - model_name="componentversion", - name="raw_contents", - field=models.ManyToManyField( - related_name="component_versions", - through="oel_components.ComponentVersionRawContent", - to="oel_contents.RawContent", - ), + model_name='componentversion', + name='raw_contents', + field=models.ManyToManyField(related_name='component_versions', through='oel_components.ComponentVersionRawContent', to='oel_contents.RawContent'), ), migrations.AddIndex( - model_name="componentversionrawcontent", - index=models.Index( - fields=["raw_content", "component_version"], - name="oel_cvrawcontent_c_cv", - ), + model_name='componentversionrawcontent', + index=models.Index(fields=['raw_content', 'component_version'], name='oel_cvrawcontent_c_cv'), ), migrations.AddIndex( - model_name="componentversionrawcontent", - index=models.Index( - fields=["component_version", "raw_content"], - name="oel_cvrawcontent_cv_d", - ), + model_name='componentversionrawcontent', + index=models.Index(fields=['component_version', 'raw_content'], name='oel_cvrawcontent_cv_d'), ), migrations.AddConstraint( - model_name="componentversionrawcontent", - constraint=models.UniqueConstraint( - fields=("component_version", "key"), name="oel_cvrawcontent_uniq_cv_key" - ), + model_name='componentversionrawcontent', + constraint=models.UniqueConstraint(fields=('component_version', 'key'), name='oel_cvrawcontent_uniq_cv_key'), ), migrations.AddIndex( - model_name="component", - index=models.Index( - fields=["learning_package", "namespace", "type", "local_key"], - name="oel_component_idx_lc_ns_t_lk", - ), + model_name='component', + index=models.Index(fields=['learning_package', 'namespace', 'type', 'local_key'], name='oel_component_idx_lc_ns_t_lk'), ), migrations.AddConstraint( - model_name="component", - constraint=models.UniqueConstraint( - fields=("learning_package", "namespace", "type", "local_key"), - name="oel_component_uniq_lc_ns_t_lk", - ), + model_name='component', + constraint=models.UniqueConstraint(fields=('learning_package', 'namespace', 'type', 'local_key'), name='oel_component_uniq_lc_ns_t_lk'), ), ] diff --git a/openedx_learning/core/components/models.py b/openedx_learning/core/components/models.py index b3a848f7..2d2f2139 100644 --- a/openedx_learning/core/components/models.py +++ b/openedx_learning/core/components/models.py @@ -18,7 +18,11 @@ """ from django.db import models -from openedx_learning.lib.fields import key_field, immutable_uuid_field +from openedx_learning.lib.fields import ( + case_sensitive_char_field, + immutable_uuid_field, + key_field, +) from ..publishing.models import LearningPackage from ..publishing.model_mixins import ( PublishableEntityMixin, @@ -77,13 +81,13 @@ class Component(PublishableEntityMixin): # namespace and type work together to help figure out what Component needs # to handle this data. A namespace is *required*. The namespace for XBlocks # is "xblock.v1" (to match the setup.py entrypoint naming scheme). - namespace = models.CharField(max_length=100, null=False, blank=False) + namespace = case_sensitive_char_field(max_length=100, blank=False) # type is a way to help sub-divide namespace if that's convenient. This # field cannot be null, but it can be blank if it's not necessary. For an # XBlock, type corresponds to tag, e.g. "video". It's also the block_type in # the UsageKey. - type = models.CharField(max_length=100, null=False, blank=True) + type = case_sensitive_char_field(max_length=100, blank=True) # local_key is an identifier that is local to the (namespace, type). The # publishable.key should be calculated as a combination of (namespace, type, diff --git a/openedx_learning/core/contents/migrations/0001_initial.py b/openedx_learning/core/contents/migrations/0001_initial.py index f106553e..76873038 100644 --- a/openedx_learning/core/contents/migrations/0001_initial.py +++ b/openedx_learning/core/contents/migrations/0001_initial.py @@ -1,103 +1,75 @@ -# Generated by Django 3.2.18 on 2023-05-11 02:07 +# Generated by Django 3.2.19 on 2023-06-15 14:43 import django.core.validators from django.db import migrations, models import django.db.models.deletion +import openedx_learning.lib.fields import openedx_learning.lib.validators +def use_compressed_table_format(apps, schema_editor): + """ + Use the COMPRESSED row format for TextContent if we're using MySQL. + + This table will hold a lot of OLX, which compresses very well using MySQL's + built-in zlib compression. This is especially important because we're + keeping so much version history. + """ + if schema_editor.connection.vendor == 'mysql': + table_name = apps.get_model("oel_contents", "TextContent")._meta.db_table + sql = f"ALTER TABLE {table_name} ROW_FORMAT=COMPRESSED;" + schema_editor.execute(sql) + + class Migration(migrations.Migration): + initial = True dependencies = [ - ("oel_publishing", "0001_initial"), + ('oel_publishing', '0001_initial'), ] operations = [ migrations.CreateModel( - name="RawContent", + name='RawContent', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("hash_digest", models.CharField(editable=False, max_length=40)), - ("mime_type", models.CharField(max_length=255)), - ( - "size", - models.PositiveBigIntegerField( - validators=[django.core.validators.MaxValueValidator(50000000)] - ), - ), - ( - "created", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), - ("file", models.FileField(null=True, upload_to="")), - ( - "learning_package", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_publishing.learningpackage", - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('hash_digest', models.CharField(editable=False, max_length=40)), + ('mime_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=255)), + ('size', models.PositiveBigIntegerField(validators=[django.core.validators.MaxValueValidator(50000000)])), + ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), + ('file', models.FileField(null=True, upload_to='')), + ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), ], options={ - "verbose_name": "Raw Content", - "verbose_name_plural": "Raw Contents", + 'verbose_name': 'Raw Content', + 'verbose_name_plural': 'Raw Contents', }, ), migrations.CreateModel( - name="TextContent", + name='TextContent', fields=[ - ( - "raw_content", - models.OneToOneField( - on_delete=django.db.models.deletion.CASCADE, - primary_key=True, - related_name="text_content", - serialize=False, - to="oel_contents.rawcontent", - ), - ), - ("text", models.TextField(blank=True, max_length=100000)), - ("length", models.PositiveIntegerField()), + ('raw_content', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, primary_key=True, related_name='text_content', serialize=False, to='oel_contents.rawcontent')), + ('text', openedx_learning.lib.fields.MultiCollationTextField(blank=True, db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=100000)), + ('length', models.PositiveIntegerField()), ], ), + # Call out to custom code here to change row format for TextContent + migrations.RunPython(use_compressed_table_format, reverse_code=migrations.RunPython.noop, atomic=False), migrations.AddIndex( - model_name="rawcontent", - index=models.Index( - fields=["learning_package", "mime_type"], - name="oel_content_idx_lp_mime_type", - ), + model_name='rawcontent', + index=models.Index(fields=['learning_package', 'mime_type'], name='oel_content_idx_lp_mime_type'), ), migrations.AddIndex( - model_name="rawcontent", - index=models.Index( - fields=["learning_package", "-size"], name="oel_content_idx_lp_rsize" - ), + model_name='rawcontent', + index=models.Index(fields=['learning_package', '-size'], name='oel_content_idx_lp_rsize'), ), migrations.AddIndex( - model_name="rawcontent", - index=models.Index( - fields=["learning_package", "-created"], - name="oel_content_idx_lp_rcreated", - ), + model_name='rawcontent', + index=models.Index(fields=['learning_package', '-created'], name='oel_content_idx_lp_rcreated'), ), migrations.AddConstraint( - model_name="rawcontent", - constraint=models.UniqueConstraint( - fields=("learning_package", "mime_type", "hash_digest"), - name="oel_content_uniq_lc_mime_type_hash_digest", - ), + model_name='rawcontent', + constraint=models.UniqueConstraint(fields=('learning_package', 'mime_type', 'hash_digest'), name='oel_content_uniq_lc_mime_type_hash_digest'), ), ] diff --git a/openedx_learning/core/contents/models.py b/openedx_learning/core/contents/models.py index e19f151c..6794f1b2 100644 --- a/openedx_learning/core/contents/models.py +++ b/openedx_learning/core/contents/models.py @@ -5,9 +5,15 @@ """ from django.db import models from django.conf import settings +from django.core.files.storage import default_storage from django.core.validators import MaxValueValidator -from openedx_learning.lib.fields import hash_field, manual_date_time_field +from openedx_learning.lib.fields import ( + case_insensitive_char_field, + hash_field, + manual_date_time_field, + MultiCollationTextField, +) from ..publishing.models import LearningPackage @@ -73,11 +79,13 @@ class RawContent(models.Model): # MIME type, such as "text/html", "image/png", etc. Per RFC 4288, MIME type # and sub-type may each be 127 chars, making a max of 255 (including the "/" - # in between). + # in between). Also, while MIME types are almost always written in lowercase + # as a matter of convention, by spec they are NOT case sensitive. # # DO NOT STORE parameters here, e.g. "charset=". We can make a new field if - # that becomes necessary. - mime_type = models.CharField(max_length=255, blank=False, null=False) + # that becomes necessary. If we do decide to store parameters and values + # later, note that those *may be* case sensitive. + mime_type = case_insensitive_char_field(max_length=255, blank=False) # This is the size of the raw data file in bytes. This can be different than # the character length, since UTF-8 encoding can use anywhere between 1-4 @@ -96,10 +104,7 @@ class RawContent(models.Model): # models that offer better latency guarantees. file = models.FileField( null=True, - storage=settings.OPENEDX_LEARNING.get( - "STORAGE", - settings.DEFAULT_FILE_STORAGE, - ), + storage=settings.OPENEDX_LEARNING.get("STORAGE", default_storage), ) class Meta: @@ -172,5 +177,16 @@ class TextContent(models.Model): primary_key=True, related_name="text_content", ) - text = models.TextField(null=False, blank=True, max_length=MAX_TEXT_LENGTH) + text = MultiCollationTextField( + blank=True, + max_length=MAX_TEXT_LENGTH, + + # We don't really expect to ever sort by the text column. This is here + # primarily to force the column to be created as utf8mb4 on MySQL. I'm + # using the binary collation because it's a little cheaper/faster. + db_collations={ + "sqlite": "BINARY", + "mysql": "utf8mb4_bin", + } + ) length = models.PositiveIntegerField(null=False) diff --git a/openedx_learning/core/publishing/migrations/0001_initial.py b/openedx_learning/core/publishing/migrations/0001_initial.py index 76e02b31..3f41cd45 100644 --- a/openedx_learning/core/publishing/migrations/0001_initial.py +++ b/openedx_learning/core/publishing/migrations/0001_initial.py @@ -1,14 +1,16 @@ -# Generated by Django 3.2.18 on 2023-05-11 02:07 +# Generated by Django 3.2.19 on 2023-06-15 14:43 from django.conf import settings import django.core.validators from django.db import migrations, models import django.db.models.deletion +import openedx_learning.lib.fields import openedx_learning.lib.validators import uuid class Migration(migrations.Migration): + initial = True dependencies = [ @@ -17,360 +19,145 @@ class Migration(migrations.Migration): operations = [ migrations.CreateModel( - name="LearningPackage", + name='LearningPackage', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "uuid", - models.UUIDField( - default=uuid.uuid4, - editable=False, - unique=True, - verbose_name="UUID", - ), - ), - ("key", models.CharField(max_length=255)), - ("title", models.CharField(max_length=1000)), - ( - "created", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), - ( - "updated", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), + ('key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), + ('title', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=500)), + ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), + ('updated', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), ], options={ - "verbose_name": "Learning Package", - "verbose_name_plural": "Learning Packages", + 'verbose_name': 'Learning Package', + 'verbose_name_plural': 'Learning Packages', }, ), migrations.CreateModel( - name="PublishableEntity", + name='PublishableEntity', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "uuid", - models.UUIDField( - default=uuid.uuid4, - editable=False, - unique=True, - verbose_name="UUID", - ), - ), - ("key", models.CharField(max_length=255)), - ( - "created", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), - ( - "created_by", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - to=settings.AUTH_USER_MODEL, - ), - ), - ( - "learning_package", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_publishing.learningpackage", - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), + ('key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), + ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), + ('created_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), + ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), ], options={ - "verbose_name": "Publishable Entity", - "verbose_name_plural": "Publishable Entities", + 'verbose_name': 'Publishable Entity', + 'verbose_name_plural': 'Publishable Entities', }, ), migrations.CreateModel( - name="PublishableEntityVersion", + name='PublishableEntityVersion', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "uuid", - models.UUIDField( - default=uuid.uuid4, - editable=False, - unique=True, - verbose_name="UUID", - ), - ), - ("title", models.CharField(blank=True, default="", max_length=1000)), - ( - "version_num", - models.PositiveBigIntegerField( - validators=[django.core.validators.MinValueValidator(1)] - ), - ), - ( - "created", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), - ( - "created_by", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - to=settings.AUTH_USER_MODEL, - ), - ), - ( - "entity", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - related_name="versions", - to="oel_publishing.publishableentity", - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), + ('title', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, default='', max_length=500)), + ('version_num', models.PositiveBigIntegerField(validators=[django.core.validators.MinValueValidator(1)])), + ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), + ('created_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), + ('entity', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='versions', to='oel_publishing.publishableentity')), ], options={ - "verbose_name": "Publishable Entity Version", - "verbose_name_plural": "Publishable Entity Versions", + 'verbose_name': 'Publishable Entity Version', + 'verbose_name_plural': 'Publishable Entity Versions', }, ), migrations.CreateModel( - name="PublishLog", + name='PublishLog', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "uuid", - models.UUIDField( - default=uuid.uuid4, - editable=False, - unique=True, - verbose_name="UUID", - ), - ), - ("message", models.CharField(blank=True, default="", max_length=1000)), - ( - "published_at", - models.DateTimeField( - validators=[ - openedx_learning.lib.validators.validate_utc_datetime - ] - ), - ), - ( - "learning_package", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_publishing.learningpackage", - ), - ), - ( - "published_by", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - to=settings.AUTH_USER_MODEL, - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), + ('message', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, default='', max_length=500)), + ('published_at', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), + ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), + ('published_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), ], options={ - "verbose_name": "Publish Log", - "verbose_name_plural": "Publish Logs", + 'verbose_name': 'Publish Log', + 'verbose_name_plural': 'Publish Logs', }, ), migrations.CreateModel( - name="Draft", + name='Draft', fields=[ - ( - "entity", - models.OneToOneField( - on_delete=django.db.models.deletion.RESTRICT, - primary_key=True, - serialize=False, - to="oel_publishing.publishableentity", - ), - ), + ('entity', models.OneToOneField(on_delete=django.db.models.deletion.RESTRICT, primary_key=True, serialize=False, to='oel_publishing.publishableentity')), ], ), migrations.CreateModel( - name="Published", + name='Published', fields=[ - ( - "entity", - models.OneToOneField( - on_delete=django.db.models.deletion.RESTRICT, - primary_key=True, - serialize=False, - to="oel_publishing.publishableentity", - ), - ), + ('entity', models.OneToOneField(on_delete=django.db.models.deletion.RESTRICT, primary_key=True, serialize=False, to='oel_publishing.publishableentity')), ], options={ - "verbose_name": "Published Entity", - "verbose_name_plural": "Published Entities", + 'verbose_name': 'Published Entity', + 'verbose_name_plural': 'Published Entities', }, ), migrations.CreateModel( - name="PublishLogRecord", + name='PublishLogRecord', fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "entity", - models.ForeignKey( - on_delete=django.db.models.deletion.RESTRICT, - to="oel_publishing.publishableentity", - ), - ), - ( - "new_version", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.RESTRICT, - to="oel_publishing.publishableentityversion", - ), - ), - ( - "old_version", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.RESTRICT, - related_name="+", - to="oel_publishing.publishableentityversion", - ), - ), - ( - "publish_log", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="oel_publishing.publishlog", - ), - ), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('entity', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='oel_publishing.publishableentity')), + ('new_version', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.RESTRICT, to='oel_publishing.publishableentityversion')), + ('old_version', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.RESTRICT, related_name='+', to='oel_publishing.publishableentityversion')), + ('publish_log', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.publishlog')), ], options={ - "verbose_name": "Publish Log Record", - "verbose_name_plural": "Publish Log Records", + 'verbose_name': 'Publish Log Record', + 'verbose_name_plural': 'Publish Log Records', }, ), migrations.AddConstraint( - model_name="learningpackage", - constraint=models.UniqueConstraint( - fields=("key",), name="oel_publishing_lp_uniq_key" - ), + model_name='learningpackage', + constraint=models.UniqueConstraint(fields=('key',), name='oel_publishing_lp_uniq_key'), + ), + migrations.AddIndex( + model_name='publishlogrecord', + index=models.Index(fields=['entity', '-publish_log'], name='oel_plr_idx_entity_rplr'), + ), + migrations.AddConstraint( + model_name='publishlogrecord', + constraint=models.UniqueConstraint(fields=('publish_log', 'entity'), name='oel_plr_uniq_pl_publishable'), ), migrations.AddField( - model_name="published", - name="publish_log_record", - field=models.ForeignKey( - on_delete=django.db.models.deletion.RESTRICT, - to="oel_publishing.publishlogrecord", - ), + model_name='published', + name='publish_log_record', + field=models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='oel_publishing.publishlogrecord'), ), migrations.AddField( - model_name="published", - name="version", - field=models.OneToOneField( - null=True, - on_delete=django.db.models.deletion.RESTRICT, - to="oel_publishing.publishableentityversion", - ), + model_name='published', + name='version', + field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.RESTRICT, to='oel_publishing.publishableentityversion'), ), migrations.AddIndex( - model_name="publishableentityversion", - index=models.Index( - fields=["entity", "-created"], name="oel_pv_idx_entity_rcreated" - ), + model_name='publishableentityversion', + index=models.Index(fields=['entity', '-created'], name='oel_pv_idx_entity_rcreated'), ), migrations.AddIndex( - model_name="publishableentityversion", - index=models.Index(fields=["title"], name="oel_pv_idx_title"), + model_name='publishableentityversion', + index=models.Index(fields=['title'], name='oel_pv_idx_title'), ), migrations.AddConstraint( - model_name="publishableentityversion", - constraint=models.UniqueConstraint( - fields=("entity", "version_num"), name="oel_pv_uniq_entity_version_num" - ), + model_name='publishableentityversion', + constraint=models.UniqueConstraint(fields=('entity', 'version_num'), name='oel_pv_uniq_entity_version_num'), ), migrations.AddIndex( - model_name="publishableentity", - index=models.Index(fields=["key"], name="oel_pub_ent_idx_key"), + model_name='publishableentity', + index=models.Index(fields=['key'], name='oel_pub_ent_idx_key'), ), migrations.AddIndex( - model_name="publishableentity", - index=models.Index( - fields=["learning_package", "-created"], - name="oel_pub_ent_idx_lp_rcreated", - ), + model_name='publishableentity', + index=models.Index(fields=['learning_package', '-created'], name='oel_pub_ent_idx_lp_rcreated'), ), migrations.AddConstraint( - model_name="publishableentity", - constraint=models.UniqueConstraint( - fields=("learning_package", "key"), name="oel_pub_ent_uniq_lp_key" - ), + model_name='publishableentity', + constraint=models.UniqueConstraint(fields=('learning_package', 'key'), name='oel_pub_ent_uniq_lp_key'), ), migrations.AddField( - model_name="draft", - name="version", - field=models.OneToOneField( - blank=True, - null=True, - on_delete=django.db.models.deletion.RESTRICT, - to="oel_publishing.publishableentityversion", - ), + model_name='draft', + name='version', + field=models.OneToOneField(blank=True, null=True, on_delete=django.db.models.deletion.RESTRICT, to='oel_publishing.publishableentityversion'), ), ] diff --git a/openedx_learning/core/publishing/migrations/0002_add_publish_log_constraints.py b/openedx_learning/core/publishing/migrations/0002_add_publish_log_constraints.py deleted file mode 100644 index 986683c4..00000000 --- a/openedx_learning/core/publishing/migrations/0002_add_publish_log_constraints.py +++ /dev/null @@ -1,24 +0,0 @@ -# Generated by Django 3.2.19 on 2023-05-14 13:58 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("oel_publishing", "0001_initial"), - ] - - operations = [ - migrations.AddIndex( - model_name="publishlogrecord", - index=models.Index( - fields=["entity", "-publish_log"], name="oel_plr_idx_entity_rplr" - ), - ), - migrations.AddConstraint( - model_name="publishlogrecord", - constraint=models.UniqueConstraint( - fields=("publish_log", "entity"), name="oel_plr_uniq_pl_publishable" - ), - ), - ] diff --git a/openedx_learning/core/publishing/models.py b/openedx_learning/core/publishing/models.py index 0f9a50fd..78f19b4b 100644 --- a/openedx_learning/core/publishing/models.py +++ b/openedx_learning/core/publishing/models.py @@ -17,8 +17,9 @@ from django.core.validators import MinValueValidator from openedx_learning.lib.fields import ( - key_field, + case_insensitive_char_field, immutable_uuid_field, + key_field, manual_date_time_field, ) @@ -32,7 +33,7 @@ class LearningPackage(models.Model): uuid = immutable_uuid_field() key = key_field() - title = models.CharField(max_length=1000, null=False, blank=False) + title = case_insensitive_char_field(max_length=500, blank=False) created = manual_date_time_field() updated = manual_date_time_field() @@ -213,7 +214,7 @@ class PublishableEntityVersion(models.Model): # Most publishable things will have some sort of title, but blanks are # allowed for those that don't require one. - title = models.CharField(max_length=1000, default="", null=False, blank=True) + title = case_insensitive_char_field(max_length=500, blank=True, default="") # The version_num starts at 1 and increments by 1 with each new version for # a given PublishableEntity. Doing it this way makes it more convenient for @@ -339,7 +340,7 @@ class PublishLog(models.Model): uuid = immutable_uuid_field() learning_package = models.ForeignKey(LearningPackage, on_delete=models.CASCADE) - message = models.CharField(max_length=1000, null=False, blank=True, default="") + message = case_insensitive_char_field(max_length=500, blank=True, default="") published_at = manual_date_time_field() published_by = models.ForeignKey( settings.AUTH_USER_MODEL, diff --git a/openedx_learning/lib/collations.py b/openedx_learning/lib/collations.py new file mode 100644 index 00000000..2afdcc37 --- /dev/null +++ b/openedx_learning/lib/collations.py @@ -0,0 +1,116 @@ +""" +This module has collation-related code to allow us to attach collation settings +to specific fields on a per-database-vendor basis. This used by the ``fields`` +module in order to specify field types have have normalized behavior between +SQLite and MySQL (see fields.py for more details). +""" +from django.db import models + + +class MultiCollationMixin: + """ + Mixin to enable multiple, database-vendor-specific collations. + + This should be mixed into new subclasses of CharField and TextField, since + they're the only Field types that store text data. + """ + + def __init__(self, *args, db_collations=None, db_collation=None, **kwargs): + """ + Init like any field but add db_collations and disallow db_collation + + The ``db_collations`` param should be a dict of vendor names to + collations, like:: + + { + 'msyql': 'utf8mb4_bin', + 'sqlite': 'BINARY' + } + + It is an error to pass in a CharField-style ``db_collation``. I + originally wanted to use this attribute name, but I needed to preserve + it for Django 3.2 compatibility (see the ``db_collation`` method + docstring for details). + """ + if db_collation is not None: + raise ValueError( + f"Cannot use db_collation with {self.__class__.__name__}. " + + "Please use a db_collations dict instead." + ) + + super().__init__(*args, **kwargs) + self.db_collations = db_collations or {} + + # This is part of a hack to get this to work for Django < 4.1. Please + # see comments in the db_collation method for details. + self._vendor = None + + @property + def db_collation(self): + """ + Return the db_collation, understanding that it varies by vendor. + + This method is a hack for Django 3.2 compatibility and should be removed + after we move to 4.2. + + Description of why this is hacky: + + In Django 4.2, the schema builder pulls the collation settings from the + field using the value returned from the ``db_parameters`` method, and + this does what we want it to do. In Django 3.2, field.db_parameters is + called, but any collation value sent back is ignored and the code grabs + the value of db_collation directly from the field: + + https://github.com/django/django/blob/stable/3.2.x/django/db/backends/base/schema.py#L214-L224 + + But this call to get the ``field.db_collation`` attribute happens almost + immediately after the ``field.db_parameters`` method call. So our + fragile hack is to set ``self._vendor`` in the ``db_parameters`` method, + using the value we get from the connection that is passed in there. We + can then use ``self._vendor`` to return the right value when Django + calls ``field.db_collation`` (which is this property method). + + This method, the corresponding setter, and all references to + ``self._vendor`` should be removed after we've cut over to Django 4.2. + """ + return self.db_collations.get(self._vendor) + + @db_collation.setter + def db_collation(self, value): + """ + Don't allow db_collation to be set manually (just ignore). + + This can be removed when we move to Django 4.2. + """ + pass + + def db_parameters(self, connection): + """ + Return database parameters for this field. This adds collation info. + + We examine this field's ``db_collations`` attribute and return the + collation that maps to ``connection.vendor``. This will typically be + 'mysql' or 'sqlite'. + """ + db_params = models.Field.db_parameters(self, connection) + + # Remove once we no longer need to support Django < 4.1 + self._vendor = connection.vendor + + # Now determine collation based on DB vendor (e.g. 'sqlite', 'mysql') + if connection.vendor in self.db_collations: + db_params["collation"] = self.db_collations[connection.vendor] + + return db_params + + def deconstruct(self): + """ + How to serialize our Field for the migration file. + + For our mixin fields, this is just doing what the field's superclass + would do and then tacking on our custom ``db_collations`` dict data. + """ + name, path, args, kwargs = super().deconstruct() + if self.db_collations: + kwargs["db_collations"] = self.db_collations + return name, path, args, kwargs diff --git a/openedx_learning/lib/fields.py b/openedx_learning/lib/fields.py index 71273afa..0e231e3a 100644 --- a/openedx_learning/lib/fields.py +++ b/openedx_learning/lib/fields.py @@ -1,51 +1,82 @@ """ Convenience functions to make consistent field conventions easier. -Field conventions: - -* Per OEP-38, we're using the MySQL-friendly convention of BigInt ID as a - primary key + separate UUID column. +Per OEP-38, we're using the MySQL-friendly convention of BigInt ID as a +primary key + separate UUID column. https://open-edx-proposals.readthedocs.io/en/latest/best-practices/oep-0038-Data-Modeling.html -* The UUID fields are intended to be globally unique identifiers that other - services can store and rely on staying the same. -* The "key" fields can be more human-friendly strings, but these may only - be unique within a given scope. These values should be treated as mutable, - even if they rarely change in practice. - -TODO: -* Try making a CaseSensitiveCharField and CaseInsensitiveCharField -* Investigate more efficient UUID binary encoding + search in MySQL - -Other data thoughts: -* It would be good to make a data-dumping sort of script that exported part of - the data to SQLite3. -* identifiers support stable import/export with serialized formats -* UUIDs will import/export in the SQLite3 dumps, but are not there in other contexts + +We have helpers to make case sensitivity consistent across backends. MySQL is +case-insensitive by default, SQLite and Postgres are case-sensitive. + + """ import hashlib import uuid from django.db import models +from .collations import MultiCollationMixin from .validators import validate_utc_datetime -def key_field(): +def create_hash_digest(data_bytes): + return hashlib.blake2b(data_bytes, digest_size=20).hexdigest() + + +def case_insensitive_char_field(**kwargs): """ - Externally created Identifier fields. + Return a case-insensitive ``MultiCollationCharField``. - These will often be local to a particular scope, like within a - LearningPackage. It's up to the application as to whether they're - semantically meaningful or look more machine-generated. + This means that entries will sort in a case-insensitive manner, and that + unique indexes will be case insensitive, e.g. you would not be able to + insert "abc" and "ABC" into the same table field if you put a unique index + on this field. - Other apps should *not* make references to these values directly, since - these values may in theory change. + You may override any argument that you would normally pass into + ``MultiCollationCharField`` (which is itself a subclass of ``CharField``). """ - return models.CharField( - max_length=255, - blank=False, - null=False, - ) + # Set our default arguments + final_kwargs = { + "null": False, + "db_collations": { + "sqlite": "NOCASE", + # We're using utf8mb4_unicode_ci to keep MariaDB compatibility, + # since their collation support diverges after this. MySQL is now on + # utf8mb4_0900_ai_ci based on Unicode 9, while MariaDB has + # uca1400_ai_ci based on Unicode 14. + "mysql": "utf8mb4_unicode_ci", + }, + } + # Override our defaults with whatever is passed in. + final_kwargs.update(kwargs) + + return MultiCollationCharField(**final_kwargs) + + +def case_sensitive_char_field(**kwargs): + """ + Return a case-sensitive ``MultiCollationCharField``. + + This means that entries will sort in a case-sensitive manner, and that + unique indexes will be case sensitive, e.g. "abc" and "ABC" would be + distinct and you would not get a unique constraint violation by adding them + both to the same table field. + + You may override any argument that you would normally pass into + ``MultiCollationCharField`` (which is itself a subclass of ``CharField``). + """ + # Set our default arguments + final_kwargs = { + "null": False, + "db_collations": { + "sqlite": "BINARY", + "mysql": "utf8mb4_bin", + }, + } + # Override our defaults with whatever is passed in. + final_kwargs.update(kwargs) + + return MultiCollationCharField(**final_kwargs) def immutable_uuid_field(): @@ -66,6 +97,20 @@ def immutable_uuid_field(): ) +def key_field(): + """ + Externally created Identifier fields. + + These will often be local to a particular scope, like within a + LearningPackage. It's up to the application as to whether they're + semantically meaningful or look more machine-generated. + + Other apps should *not* make references to these values directly, since + these values may in theory change (even if this is rare in practice). + """ + return case_sensitive_char_field(max_length=500, blank=False) + + def hash_field(): """ Holds a hash digest meant to identify a piece of content. @@ -85,10 +130,6 @@ def hash_field(): ) -def create_hash_digest(data_bytes): - return hashlib.blake2b(data_bytes, digest_size=20).hexdigest() - - def manual_date_time_field(): """ DateTimeField that does not auto-generate values. @@ -117,3 +158,29 @@ def manual_date_time_field(): validate_utc_datetime, ], ) + + +class MultiCollationCharField(MultiCollationMixin, models.CharField): + """ + CharField subclass with per-database-vendor collation settings. + + Django's CharField already supports specifying the database collation, but + that only works with a single value. So there would be no way to say, "Use + utf8mb4_bin for MySQL, and BINARY if we're running SQLite." This is a + problem because we run tests in SQLite (and may potentially run more later). + It's also a problem if we ever want to support other database backends, like + PostgreSQL. Even MariaDB is starting to diverge from MySQL in terms of what + collations are supported. + """ + pass + + +class MultiCollationTextField(MultiCollationMixin, models.TextField): + """ + TextField subclass with per-database-vendor collation settings. + + We don't ever really want to _sort_ by a TextField, but setting a collation + forces the compatible charset to be set in MySQL, and that's the part that + matters for our purposes. So for example, if you set + """ + pass diff --git a/setup.py b/setup.py index 4fef021c..ab642620 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import re import sys -from setuptools import setup +from setuptools import find_packages, setup def get_version(*file_paths): @@ -72,10 +72,10 @@ def is_requirement(line): author='David Ormsbee', author_email='dave@tcril.org', url='https://github.com/openedx/openedx-learning', - packages=[ - 'openedx_learning', - 'openedx_tagging', - ], + packages=find_packages( + include=['openedx_learning*', 'openedx_tagging*'], + exclude=['*.test', '*.tests'] + ), include_package_data=True, install_requires=load_requirements('requirements/base.in'), python_requires=">=3.8",