Skip to content

Commit

Permalink
Search: stop relying on the DB when indexing (#10696)
Browse files Browse the repository at this point in the history
- Removed the "wipe" actions from the admin instead of porting them, since I'm not sure that we need an action in the admin just to delete the search index of a project. Re-index seems useful.
- `fileify` was replaced by `index_build`, and it only requires the build id to be passed, any other information can be retrieved from the build/version object.
- `fileify` isn't removed in this PR to avoid downtimes during deploy, it's safe to keep it around till next deploy.
- New code is avoiding any deep connection to the django-elasticsearch-dsl package, since it doesn't make sense anymore to have it, and I'm planning on removing it.
- We are no longer tracking all files in the DB, only the ones of interest.
- Re-indexing a version will also re-evaluate the files from the DB, useful for old projects that are out of sync.
- The reindex command now generates taks per-version rather than per-collection of files, since we no longer track all files in the DB.


- Closes #10623
- Closes #10690

We don't need to do anything special during deploy, zero downtime out of the box. We can trigger a re-index for all versions if we want to delete the HTML files that we don't need from the DB, but that operation will also re-index their contents in ES, so probably better do that after we are all settled with any changes to ES.
  • Loading branch information
stsewd authored Sep 14, 2023
1 parent 31f0da5 commit fa54900
Show file tree
Hide file tree
Showing 16 changed files with 587 additions and 435 deletions.
2 changes: 1 addition & 1 deletion docs/dev/server-side-search.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ You can fix this by deleting the page index and :ref:`re-indexing <server-side-s
.. prompt:: bash

inv docker.manage 'search_index --delete'
inv docker.manage reindex_elasticsearch
inv docker.manage 'reindex_elasticsearch --queue web'

How we index documentations
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
39 changes: 4 additions & 35 deletions readthedocs/builds/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
)
from readthedocs.core.utils import trigger_build
from readthedocs.core.utils.admin import pretty_json_field
from readthedocs.projects.models import HTMLFile
from readthedocs.search.utils import _indexing_helper
from readthedocs.projects.tasks.search import reindex_version


class BuildCommandResultInline(admin.TabularInline):
Expand Down Expand Up @@ -89,7 +88,7 @@ class VersionAdmin(admin.ModelAdmin):
list_filter = ("type", "privacy_level", "active", "built")
search_fields = ("slug", "project__slug")
raw_id_fields = ("project",)
actions = ["build_version", "reindex_version", "wipe_version_indexes"]
actions = ["build_version", "reindex_version"]

def project_slug(self, obj):
return obj.project.slug
Expand Down Expand Up @@ -117,41 +116,11 @@ def build_version(self, request, queryset):
@admin.action(description="Reindex version to ES")
def reindex_version(self, request, queryset):
"""Reindexes all selected versions to ES."""
html_objs_qs = []
for version in queryset.iterator():
html_objs = HTMLFile.objects.filter(
project=version.project, version=version
)

if html_objs.exists():
html_objs_qs.append(html_objs)

if html_objs_qs:
_indexing_helper(html_objs_qs, wipe=False)
for version_id in queryset.values_list("id", flat=True).iterator():
reindex_version.delay(version_id)

self.message_user(request, "Task initiated successfully.", messages.SUCCESS)

@admin.action(description="Wipe version from ES")
def wipe_version_indexes(self, request, queryset):
"""Wipe selected versions from ES."""
html_objs_qs = []
for version in queryset.iterator():
html_objs = HTMLFile.objects.filter(
project=version.project, version=version
)

if html_objs.exists():
html_objs_qs.append(html_objs)

if html_objs_qs:
_indexing_helper(html_objs_qs, wipe=True)

self.message_user(
request,
"Task initiated successfully",
messages.SUCCESS,
)


@admin.register(RegexAutomationRule)
class RegexAutomationRuleAdmin(PolymorphicChildModelAdmin, admin.ModelAdmin):
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def config(self):
:rtype: dict
"""
last_build = (
self.builds(manager=INTERNAL).filter(
self.builds.filter(
state=BUILD_STATE_FINISHED,
success=True,
).order_by('-date')
Expand Down
22 changes: 22 additions & 0 deletions readthedocs/builds/querysets.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,28 @@ def public(
def api(self, user=None):
return self.public(user, only_active=False)

def for_reindex(self):
"""
Get all versions that can be reindexed.
A version can be reindexed if:
- It's active and has been built at least once successfully.
Since that means that it has files to be indexed.
- Its project is not delisted or marked as spam.
"""
return (
self.filter(
active=True,
built=True,
builds__state=BUILD_STATE_FINISHED,
builds__success=True,
)
.exclude(project__delisted=True)
.exclude(project__is_spam=True)
.distinct()
)


class VersionQuerySet(SettingsOverrideObject):
_default_class = VersionQuerySetBase
Expand Down
57 changes: 8 additions & 49 deletions readthedocs/projects/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from readthedocs.core.history import ExtraSimpleHistoryAdmin, set_change_reason
from readthedocs.core.utils import trigger_build
from readthedocs.notifications.views import SendNotificationView
from readthedocs.projects.tasks.search import reindex_version
from readthedocs.redirects.models import Redirect
from readthedocs.search.utils import _indexing_helper

from .forms import FeatureForm
from .models import (
Expand Down Expand Up @@ -264,7 +264,6 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin):
"run_spam_rule_checks",
"build_default_version",
"reindex_active_versions",
"wipe_all_versions",
"import_tags_from_vcs",
]

Expand Down Expand Up @@ -362,66 +361,26 @@ def reindex_active_versions(self, request, queryset):
"""Reindex all active versions of the selected projects to ES."""
qs_iterator = queryset.iterator()
for project in qs_iterator:
version_qs = Version.internal.filter(project=project)
active_versions = version_qs.filter(active=True)
versions_id_to_reindex = project.versions.for_reindex().values_list(
"pk", flat=True
)

if not active_versions.exists():
if not versions_id_to_reindex.exists():
self.message_user(
request,
"No active versions of project {}".format(project),
"No versions to be re-indexed for project {}".format(project),
messages.ERROR,
)
else:
html_objs_qs = []
for version in active_versions.iterator():
html_objs = HTMLFile.objects.filter(
project=project, version=version
)

if html_objs.exists():
html_objs_qs.append(html_objs)

if html_objs_qs:
_indexing_helper(html_objs_qs, wipe=False)
for version_id in versions_id_to_reindex.iterator():
reindex_version.delay(version_id)

self.message_user(
request,
"Task initiated successfully for {}".format(project),
messages.SUCCESS,
)

# TODO: rename method to mention "indexes" on its name
@admin.action(description="Wipe all versions from ES")
def wipe_all_versions(self, request, queryset):
"""Wipe indexes of all versions of selected projects."""
qs_iterator = queryset.iterator()
for project in qs_iterator:
version_qs = Version.internal.filter(project=project)
if not version_qs.exists():
self.message_user(
request,
"No active versions of project {}.".format(project),
messages.ERROR,
)
else:
html_objs_qs = []
for version in version_qs.iterator():
html_objs = HTMLFile.objects.filter(
project=project, version=version
)

if html_objs.exists():
html_objs_qs.append(html_objs)

if html_objs_qs:
_indexing_helper(html_objs_qs, wipe=True)

self.message_user(
request,
"Task initiated successfully for {}.".format(project),
messages.SUCCESS,
)

@admin.action(description="Import tags from the version control API")
def import_tags_from_vcs(self, request, queryset):
for project in queryset.iterator():
Expand Down
10 changes: 2 additions & 8 deletions readthedocs/projects/tasks/builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
from ..models import APIProject, WebHookEvent
from ..signals import before_vcs
from .mixins import SyncRepositoryMixin
from .search import fileify
from .search import index_build
from .utils import (
BuildRequest,
clean_build,
Expand Down Expand Up @@ -653,13 +653,7 @@ def on_success(self, retval, task_id, args, kwargs):
)

# Index search data
fileify.delay(
version_pk=self.data.version.pk,
commit=self.data.build['commit'],
build=self.data.build['id'],
search_ranking=self.data.config.search.ranking,
search_ignore=self.data.config.search.ignore,
)
index_build.delay(build_id=self.data.build["id"])

if not self.data.project.has_valid_clone:
self.set_valid_clone()
Expand Down
Loading

0 comments on commit fa54900

Please sign in to comment.