diff --git a/docs/dev/server-side-search.rst b/docs/dev/server-side-search.rst
index 56e1636e58d..18b7c82d612 100644
--- a/docs/dev/server-side-search.rst
+++ b/docs/dev/server-side-search.rst
@@ -62,7 +62,7 @@ You can fix this by deleting the page index and :ref:`re-indexing
diff --git a/readthedocs/rtd_tests/files/404.html b/readthedocs/rtd_tests/files/404.html
new file mode 100644
--- /dev/null
+++ b/readthedocs/rtd_tests/files/404.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <title>404 - Page not found</title>
+</head>
+
+<body>
+    Page not found
+</body>
+</html>
diff --git a/readthedocs/rtd_tests/files/index.html b/readthedocs/rtd_tests/files/index.html
new file mode 100644
index 00000000000..03363c621ca
--- /dev/null
+++ b/readthedocs/rtd_tests/files/index.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <title>Root index file</title>
+</head>
+
+<body>
+    Root index file
+</body>
+</html>
diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py
index 78e138e7b7e..0d44ed72595 100644
--- a/readthedocs/rtd_tests/tests/test_build_storage.py
+++ b/readthedocs/rtd_tests/tests/test_build_storage.py
@@ -58,9 +58,11 @@ def test_sync_directory(self):
 
         tree = [
             ('api', ['index.html']),
-            'api.fjson',
-            'conf.py',
-            'test.html',
+            "404.html",
+            "api.fjson",
+            "conf.py",
+            "index.html",
+            "test.html",
         ]
         with override_settings(DOCROOT=tmp_files_dir):
             self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
@@ -68,7 +70,9 @@
 
         tree = [
             ('api', ['index.html']),
+            "404.html",
             'conf.py',
+            "index.html",
             'test.html',
         ]
         os.remove(os.path.join(tmp_files_dir, 'api.fjson'))
@@ -77,7 +81,9 @@
         self.assertFileTree(storage_dir, tree)
 
         tree = [
+            "404.html",
             'conf.py',
+            "index.html",
             'test.html',
         ]
         shutil.rmtree(os.path.join(tmp_files_dir, 'api'))
@@ -126,7 +132,9 @@ def test_delete_directory(self):
         self.storage.copy_directory(files_dir, "files")
         dirs, files = self.storage.listdir("files")
         self.assertEqual(dirs, ["api"])
-        self.assertCountEqual(files, ["api.fjson", "conf.py", "test.html"])
+        self.assertCountEqual(
+            files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"]
+        )
 
         self.storage.delete_directory('files/')
         _, files = self.storage.listdir('files')
@@ -147,9 +155,11 @@ def test_walk(self):
         self.assertEqual(len(output), 2)
 
         top, dirs, files = output[0]
-        self.assertEqual(top, 'files')
-        self.assertCountEqual(dirs, ['api'])
-        self.assertCountEqual(files, ['api.fjson', 'conf.py', 'test.html'])
+        self.assertEqual(top, "files")
+        self.assertCountEqual(dirs, ["api"])
+        self.assertCountEqual(
+            files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"]
+        )
 
         top, dirs, files = output[1]
         self.assertEqual(top, 'files/api')
@@ -163,8 +173,10 @@ def test_rclone_sync(self):
 
         tree = [
             ("api", ["index.html"]),
+            "404.html",
             "api.fjson",
             "conf.py",
+            "index.html",
             "test.html",
         ]
         with override_settings(DOCROOT=tmp_files_dir):
@@ -173,7 +185,9 @@
 
         tree = [
             ("api", ["index.html"]),
+            "404.html",
             "conf.py",
+            "index.html",
             "test.html",
         ]
         (tmp_files_dir / "api.fjson").unlink()
@@ -182,7 +196,9 @@
         self.assertFileTree(storage_dir, tree)
 
         tree = [
+            "404.html",
             "conf.py",
+            "index.html",
             "test.html",
         ]
         shutil.rmtree(tmp_files_dir / "api")
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 4e1aea8ce9f..82e25f98b88 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -6,14 +6,13 @@
 from django.test.utils import override_settings
 
 from readthedocs.projects.models import HTMLFile, ImportedFile, Project
-from readthedocs.projects.tasks.search import (
-    _create_imported_files,
-    _sync_imported_files,
-)
+from readthedocs.projects.tasks.search import _create_imported_files_and_search_index
+from readthedocs.search.documents import PageDocument
 
 base_dir = os.path.dirname(os.path.dirname(__file__))
 
 
+@override_settings(ELASTICSEARCH_DSL_AUTOSYNC=True)
 class ImportedFileTests(TestCase):
     fixtures = ["eric", "test_data"]
@@ -27,20 +26,22 @@ def setUp(self):
         with override_settings(DOCROOT=self.test_dir):
             self._copy_storage_dir()
 
-    def _manage_imported_files(
-        self, version, commit, build, search_ranking=None, search_ignore=None
-    ):
+    def tearDown(self):
+        try:
+            PageDocument().search().filter().delete()
+        except Exception:
+            # If there are no documents, the query fails.
+            pass
+
+    def _manage_imported_files(self, version, search_ranking=None, search_ignore=None):
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
         search_ignore = search_ignore or []
 
-        _create_imported_files(
+        return _create_imported_files_and_search_index(
             version=version,
-            commit=commit,
-            build=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
-        _sync_imported_files(version, build)
 
     def _copy_storage_dir(self):
         """Copy the test directory (rtd_tests/files) to storage"""
@@ -54,96 +55,209 @@ def _copy_storage_dir(self):
         )
 
     def test_properly_created(self):
-        # Only 2 files in the directory is HTML (test.html, api/index.html)
+        """
+        Only 4 files in the directory are HTML
+
+        - index.html
+        - test.html
+        - api/index.html
+        - 404.html
+
+        But we create imported files for index.html and 404.html files only.
+        """
         self.assertEqual(ImportedFile.objects.count(), 0)
-        self._manage_imported_files(version=self.version, commit="commit01", build=1)
-        self.assertEqual(ImportedFile.objects.count(), 2)
-        self._manage_imported_files(version=self.version, commit="commit01", build=2)
-        self.assertEqual(ImportedFile.objects.count(), 2)
-        self.project.cdn_enabled = True
-        self.project.save()
+
+        sync_id = self._manage_imported_files(version=self.version)
+        self.assertEqual(ImportedFile.objects.count(), 3)
+        self.assertEqual(
+            set(HTMLFile.objects.all().values_list("path", flat=True)),
+            {"index.html", "api/index.html", "404.html"},
+        )
+
+        results = PageDocument().search().filter("term", build=sync_id).execute()
+        self.assertEqual(
+            {result.path for result in results},
+            {"index.html", "404.html", "test.html", "api/index.html"},
+        )
+
+        sync_id = self._manage_imported_files(version=self.version)
+        self.assertEqual(ImportedFile.objects.count(), 3)
+        self.assertEqual(
+            set(HTMLFile.objects.all().values_list("path", flat=True)),
+            {"index.html", "api/index.html", "404.html"},
+        )
+
+        results = PageDocument().search().filter("term", build=sync_id).execute()
+        self.assertEqual(
+            {result.path for result in results},
+            {"index.html", "404.html", "test.html", "api/index.html"},
+        )
 
-    def test_update_commit(self):
+    def test_update_build(self):
         self.assertEqual(ImportedFile.objects.count(), 0)
-        self._manage_imported_files(self.version, "commit01", 1)
-        self.assertEqual(ImportedFile.objects.first().commit, "commit01")
-        self._manage_imported_files(self.version, "commit02", 2)
-        self.assertEqual(ImportedFile.objects.first().commit, "commit02")
+
+        sync_id = self._manage_imported_files(self.version)
+        for obj in ImportedFile.objects.all():
+            self.assertEqual(obj.build, sync_id)
+
+        results = PageDocument().search().filter().execute()
+        self.assertEqual(len(results), 4)
+        for result in results:
+            self.assertEqual(result.build, sync_id)
+
+        sync_id = self._manage_imported_files(self.version)
+        for obj in ImportedFile.objects.all():
+            self.assertEqual(obj.build, sync_id)
+
+        # NOTE: we can't test that the files from the previous build
+        # were deleted, since deletion happens asynchronously.
+        results = PageDocument().search().filter("term", build=sync_id).execute()
+        self.assertEqual(len(results), 4)
 
     def test_page_default_rank(self):
         search_ranking = {}
         self.assertEqual(HTMLFile.objects.count(), 0)
-        self._manage_imported_files(self.version, "commit01", 1, search_ranking)
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )
 
-        self.assertEqual(HTMLFile.objects.count(), 2)
-        self.assertEqual(HTMLFile.objects.filter(rank=0).count(), 2)
+        results = (
+            PageDocument()
+            .search()
+            .filter("term", project=self.project.slug)
+            .filter("term", version=self.version.slug)
+            .execute()
+        )
+        self.assertEqual(len(results), 4)
+        for result in results:
+            self.assertEqual(result.project, self.project.slug)
+            self.assertEqual(result.version, self.version.slug)
+            self.assertEqual(result.build, sync_id)
+            self.assertEqual(result.rank, 0)
 
     def test_page_custom_rank_glob(self):
         search_ranking = {
             "*index.html": 5,
         }
-        self._manage_imported_files(self.version, "commit01", 1, search_ranking)
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )
 
-        self.assertEqual(HTMLFile.objects.count(), 2)
-        file_api = HTMLFile.objects.get(path="api/index.html")
-        file_test = HTMLFile.objects.get(path="test.html")
-        self.assertEqual(file_api.rank, 5)
-        self.assertEqual(file_test.rank, 0)
+        results = (
+            PageDocument()
+            .search()
+            .filter("term", project=self.project.slug)
+            .filter("term", version=self.version.slug)
+            .execute()
+        )
+        self.assertEqual(len(results), 4)
+        for result in results:
+            self.assertEqual(result.project, self.project.slug)
+            self.assertEqual(result.version, self.version.slug)
+            self.assertEqual(result.build, sync_id)
+            if result.path.endswith("index.html"):
+                self.assertEqual(result.rank, 5, result.path)
+            else:
+                self.assertEqual(result.rank, 0, result.path)
 
     def test_page_custom_rank_several(self):
         search_ranking = {
             "test.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(self.version, "commit01", 1, search_ranking)
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )
 
-        self.assertEqual(HTMLFile.objects.count(), 2)
-        file_api = HTMLFile.objects.get(path="api/index.html")
-        file_test = HTMLFile.objects.get(path="test.html")
-        self.assertEqual(file_api.rank, 2)
-        self.assertEqual(file_test.rank, 5)
+        results = (
+            PageDocument()
+            .search()
+            .filter("term", project=self.project.slug)
+            .filter("term", version=self.version.slug)
+            .execute()
+        )
+        self.assertEqual(len(results), 4)
+        for result in results:
+            self.assertEqual(result.project, self.project.slug)
+            self.assertEqual(result.version, self.version.slug)
+            self.assertEqual(result.build, sync_id)
+            if result.path == "test.html":
+                self.assertEqual(result.rank, 5)
+            elif result.path == "api/index.html":
+                self.assertEqual(result.rank, 2)
+            else:
+                self.assertEqual(result.rank, 0)
 
     def test_page_custom_rank_precedence(self):
         search_ranking = {
             "*.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(self.version, "commit01", 1, search_ranking)
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )
 
-        self.assertEqual(HTMLFile.objects.count(), 2)
-        file_api = HTMLFile.objects.get(path="api/index.html")
-        file_test = HTMLFile.objects.get(path="test.html")
HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 2) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, sync_id) + if result.path == "api/index.html": + self.assertEqual(result.rank, 2, result.path) + else: + self.assertEqual(result.rank, 5, result.path) def test_page_custom_rank_precedence_inverted(self): search_ranking = { "api/index.html": 2, "*.html": 5, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + sync_id = self._manage_imported_files( + self.version, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 5) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, sync_id) + self.assertEqual(result.rank, 5) def test_search_page_ignore(self): search_ignore = ["api/index.html"] self._manage_imported_files( self.version, - "commit01", - 1, search_ignore=search_ignore, ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertTrue(file_api.ignore) - self.assertFalse(file_test.ignore) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 3) + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) def test_update_content(self): test_dir = os.path.join(base_dir, "files") @@ -155,8 +269,18 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit01", 1) - self.assertEqual(ImportedFile.objects.count(), 2) + sync_id = self._manage_imported_files(self.version) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=sync_id) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Woo") with open(os.path.join(test_dir, "test.html"), "w+") as f: f.write("Something Else") @@ -164,5 +288,15 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit02", 2) - self.assertEqual(ImportedFile.objects.count(), 2) + sync_id = self._manage_imported_files(self.version) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + 
.filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=sync_id) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Something Else") diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py index 50e412682d7..514c6a39299 100644 --- a/readthedocs/search/management/commands/reindex_elasticsearch.py +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -1,9 +1,9 @@ import itertools -import structlog import sys import textwrap from datetime import datetime, timedelta +import structlog from django.apps import apps from django.conf import settings from django.core.management import BaseCommand @@ -11,6 +11,8 @@ from readthedocs.builds.models import Version from readthedocs.projects.models import HTMLFile, Project +from readthedocs.projects.tasks.search import reindex_version +from readthedocs.search.documents import PageDocument, ProjectDocument from readthedocs.search.tasks import ( create_new_es_index, index_objects_to_es, @@ -45,46 +47,20 @@ def _get_indexing_tasks( yield index_objects_to_es.si(**data) def _run_reindex_tasks(self, models, queue): - apply_async_kwargs = {"queue": queue} log.info("Adding indexing tasks to queue.", queue=queue) - timestamp = datetime.now().strftime("%Y%m%d%H%M%S") - for doc in registry.get_documents(models): - queryset = doc().get_queryset() - - app_label = queryset.model._meta.app_label - model_name = queryset.model.__name__ - - index_name = doc._index._name - new_index_name = "{}_{}".format(index_name, timestamp) - - # Set and create a temporal index for indexing. - create_new_es_index( - app_label=app_label, - model_name=model_name, - index_name=index_name, - new_index_name=new_index_name, - ) - doc._index._name = new_index_name - log.info("Temporal index created.", index_name=new_index_name) - - indexing_tasks = self._get_indexing_tasks( - app_label=app_label, - model_name=model_name, - queryset=queryset, - index_name=new_index_name, - document_class=str(doc), - ) - for task in indexing_tasks: - task.apply_async(**apply_async_kwargs) + for model in models: + if model == HTMLFile: + self._reindex_files(queue=queue, timestamp=timestamp) + elif model == Project: + self._reindex_projects(queue=queue, timestamp=timestamp) + else: + log.warning( + "Re-index not available for model.", model_name=model.__name__ + ) + continue - log.info( - "Tasks issued successfully.", - model_name=model_name, - app_label=app_label, - items=queryset.count(), - ) return timestamp def _change_index(self, models, timestamp): @@ -110,10 +86,9 @@ def _change_index(self, models, timestamp): def _reindex_from(self, days_ago, models, queue): functions = { - apps.get_model("projects.HTMLFile"): self._reindex_files_from, - apps.get_model("projects.Project"): self._reindex_projects_from, + HTMLFile: self._reindex_files_from, + Project: self._reindex_projects_from, } - models = models or functions.keys() for model in models: if model not in functions: log.warning( @@ -122,6 +97,42 @@ def _reindex_from(self, days_ago, models, queue): continue functions[model](days_ago=days_ago, queue=queue) + def _reindex_projects(self, queue, timestamp): + document = ProjectDocument + app_label = Project._meta.app_label + model_name = Project.__name__ + index_name = document._index._name + new_index_name = "{}_{}".format(index_name, timestamp) + + create_new_es_index( + app_label=app_label, + model_name=model_name, + index_name=index_name, + 
+            new_index_name=new_index_name,
+        )
+        log.info("Temporal index created.", index_name=new_index_name)
+
+        queryset = document().get_queryset()
+        indexing_tasks = self._get_indexing_tasks(
+            app_label=app_label,
+            model_name=model_name,
+            queryset=queryset,
+            index_name=new_index_name,
+            document_class=str(document),
+        )
+        number_of_tasks = 0
+        for task in indexing_tasks:
+            task.apply_async(queue=queue)
+            number_of_tasks += 1
+
+        log.info(
+            "Tasks issued successfully.",
+            model_name=model_name,
+            app_label=app_label,
+            items=queryset.count(),
+            number_of_tasks=number_of_tasks,
+        )
+
     def _reindex_projects_from(self, days_ago, queue):
         """Reindex projects with recent changes."""
         since = datetime.now() - timedelta(days=days_ago)
@@ -147,51 +158,53 @@ def _reindex_projects_from(self, days_ago, queue):
             items=queryset.count(),
         )
 
-    def _reindex_files_from(self, days_ago, queue):
-        """Reindex HTML files from versions with recent builds."""
-        chunk_size = settings.ES_TASK_CHUNK_SIZE
-        since = datetime.now() - timedelta(days=days_ago)
-        queryset = Version.objects.filter(builds__date__gte=since).distinct()
+    def _reindex_files(self, queue, timestamp):
+        document = PageDocument
         app_label = HTMLFile._meta.app_label
         model_name = HTMLFile.__name__
-        apply_async_kwargs = {
-            "queue": queue,
-            "kwargs": {
-                "app_label": app_label,
-                "model_name": model_name,
-            },
-        }
+        index_name = document._index._name
+        new_index_name = "{}_{}".format(index_name, timestamp)
+        create_new_es_index(
+            app_label=app_label,
+            model_name=model_name,
+            index_name=index_name,
+            new_index_name=new_index_name,
+        )
+        log.info("Temporal index created.", index_name=new_index_name)
 
-        for doc in registry.get_documents(models=[HTMLFile]):
-            apply_async_kwargs["kwargs"]["document_class"] = str(doc)
-            for version in queryset.iterator():
-                project = version.project
-                files_qs = (
-                    HTMLFile.objects.filter(version=version)
-                    .values_list("pk", flat=True)
-                    .iterator()
-                )
-                current = 0
-                while True:
-                    objects_id = list(itertools.islice(files_qs, chunk_size))
-                    if not objects_id:
-                        break
-                    current += len(objects_id)
-                    log.info(
-                        "Re-indexing files.",
-                        version_slug=version.slug,
-                        project_slug=project.slug,
-                        items=current,
-                    )
-                    apply_async_kwargs["kwargs"]["objects_id"] = objects_id
-                    index_objects_to_es.apply_async(**apply_async_kwargs)
+        queryset = Version.objects.for_reindex().values_list("pk", flat=True)
+        for version_id in queryset.iterator():
+            reindex_version.apply_async(
+                kwargs={
+                    "version_id": version_id,
+                    "search_index_name": new_index_name,
+                },
+                queue=queue,
+            )
 
-                log.info(
-                    "Tasks issued successfully.",
-                    project_slug=project.slug,
-                    version_slug=version.slug,
-                    items=current,
-                )
+        log.info(
+            "Tasks issued successfully for re-indexing of versions.",
+            number_of_tasks=queryset.count(),
+        )
+
+    def _reindex_files_from(self, days_ago, queue):
+        """Reindex HTML files from versions with recent builds."""
+        since = datetime.now() - timedelta(days=days_ago)
+        queryset = (
+            Version.objects.for_reindex()
+            .filter(builds__date__gte=since)
+            .values_list("pk", flat=True)
+        )
+        for version_id in queryset.iterator():
+            reindex_version.apply_async(
+                kwargs={"version_id": version_id},
+                queue=queue,
+            )
+
+        log.info(
+            "Tasks issued successfully for re-indexing of versions.",
+            number_of_tasks=queryset.count(),
+        )
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -239,9 +252,10 @@ def handle(self, *args, **options):
         `--model <app_label>.<model_name>` parameter.
         Otherwise, it will re-index all the models
         """
-        models = None
         if options["models"]:
             models = [apps.get_model(model_name) for model_name in options["models"]]
+        else:
+            models = [Project, HTMLFile]
 
         queue = options["queue"]
         change_index = options["change_index"]
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
index 123b0bbd652..3f4a868666f 100644
--- a/readthedocs/search/tests/test_api.py
+++ b/readthedocs/search/tests/test_api.py
@@ -21,7 +21,6 @@
     SECTION_FIELDS,
     get_search_query_from_project_file,
 )
-from readthedocs.search.utils import index_new_files, remove_indexed_files
 
 
 @pytest.mark.django_db
@@ -726,64 +725,6 @@ def test_search_custom_ranking(self, api_client):
         assert results[0]["path"] == "/en/latest/index.html"
         assert results[1]["path"] == "/en/latest/guides/index.html"
 
-    def test_search_ignore(self, api_client):
-        project = Project.objects.get(slug="docs")
-        version = project.versions.all().first()
-
-        page_index = HTMLFile.objects.get(
-            version=version,
-            path="index.html",
-        )
-        page_guides = HTMLFile.objects.get(
-            version=version,
-            path="guides/index.html",
-        )
-
-        search_params = {
-            "project": project.slug,
-            "version": version.slug,
-            "q": '"content from"',
-        }
-
-        # Query with files not ignored.
-        assert page_index.ignore is None
-        assert page_guides.ignore is None
-
-        resp = self.get_search(api_client, search_params)
-        assert resp.status_code == 200
-
-        results = resp.data["results"]
-        assert len(results) == 2
-        assert results[0]["path"] == "/en/latest/index.html"
-        assert results[1]["path"] == "/en/latest/guides/index.html"
-
-        # Query with guides/index.html ignored.
-        page_guides.ignore = True
-        page_guides.save()
-
-        remove_indexed_files(HTMLFile, project.slug, version.slug)
-        index_new_files(HTMLFile, version, page_index.build)
-
-        resp = self.get_search(api_client, search_params)
-        assert resp.status_code == 200
-
-        results = resp.data["results"]
-        assert len(results) == 1
-        assert results[0]["path"] == "/en/latest/index.html"
-
-        # Query with index.html and guides/index.html ignored.
-        page_index.ignore = True
-        page_index.save()
-
-        remove_indexed_files(HTMLFile, project.slug, version.slug)
-        index_new_files(HTMLFile, version, page_index.build)
-
-        resp = self.get_search(api_client, search_params)
-        assert resp.status_code == 200
-
-        results = resp.data["results"]
-        assert len(results) == 0
-
 
 class TestDocumentSearch(BaseTestDocumentSearch):
     pass
diff --git a/readthedocs/search/tests/test_utils.py b/readthedocs/search/tests/test_utils.py
index 21583603534..8d7429f8b5f 100644
--- a/readthedocs/search/tests/test_utils.py
+++ b/readthedocs/search/tests/test_utils.py
@@ -4,7 +4,6 @@
 from django.urls import reverse
 
 from readthedocs.builds.constants import LATEST, STABLE
-from readthedocs.projects.models import HTMLFile
 from readthedocs.search import utils
 from readthedocs.search.tests.utils import get_search_query_from_project_file
 
@@ -33,10 +32,7 @@ def test_remove_only_one_project_index(self, api_client, all_projects):
         assert self.has_results(api_client, project, LATEST)
         assert self.has_results(api_client, project, STABLE)
 
-        utils.remove_indexed_files(
-            HTMLFile,
-            project_slug=project,
-        )
+        utils.remove_indexed_files(project_slug=project)
         # Deletion of indices from ES happens async,
         # so we need to wait a little before checking for results.
         time.sleep(1)
@@ -55,7 +51,6 @@ def test_remove_only_one_version_index(self, api_client, all_projects):
         assert self.has_results(api_client, project, STABLE)
 
         utils.remove_indexed_files(
-            HTMLFile,
             project_slug=project,
             version_slug=LATEST,
         )
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index daf6d47d790..4dd10b49e39 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -5,36 +5,32 @@
 from django_elasticsearch_dsl.apps import DEDConfig
 from django_elasticsearch_dsl.registries import registry
 
-from readthedocs.projects.models import HTMLFile
+from readthedocs.search.documents import PageDocument
 
 log = structlog.get_logger(__name__)
 
 
-def index_new_files(model, version, build):
-    """Index new files from the version into the search index."""
-
-    log.bind(
-        project_slug=version.project.slug,
-        version_slug=version.slug,
-    )
-
+def index_objects(document, objects, index_name=None):
     if not DEDConfig.autosync_enabled():
-        log.info("Autosync disabled. Skipping indexing into the search index.")
+        log.info("Autosync disabled, skipping search indexing.")
         return
 
-    try:
-        document = list(registry.get_documents(models=[model]))[0]
-        doc_obj = document()
-        queryset = doc_obj.get_queryset().filter(
-            project=version.project, version=version, build=build
-        )
-        log.info("Indexing new objecst into search index.")
-        doc_obj.update(queryset.iterator())
-    except Exception:
-        log.exception("Unable to index a subset of files. Continuing.")
+    # If a search index name is provided, we need to temporarily change
+    # the index name of the document.
+    old_index_name = document._index._name
+    if index_name:
+        document._index._name = index_name
+
+    document().update(objects)
+
+    # Restore the old index name.
+    if index_name:
+        document._index._name = old_index_name
 
 
-def remove_indexed_files(model, project_slug, version_slug=None, build_id=None):
+def remove_indexed_files(
+    project_slug, version_slug=None, sync_id=None, index_name=None
+):
     """
     Remove files from `version_slug` of `project_slug` from the search index.
 
@@ -54,18 +50,28 @@
         log.info("Autosync disabled, skipping removal from the search index.")
         return
 
+    # If a search index name is provided, we need to temporarily change
+    # the index name of the document.
+    document = PageDocument
+    old_index_name = document._index._name
+    if index_name:
+        document._index._name = index_name
+
     try:
-        document = list(registry.get_documents(models=[model]))[0]
         log.info("Deleting old files from search index.")
         documents = document().search().filter("term", project=project_slug)
         if version_slug:
             documents = documents.filter("term", version=version_slug)
-        if build_id:
-            documents = documents.exclude("term", build=build_id)
+        if sync_id:
+            documents = documents.exclude("term", build=sync_id)
         documents.delete()
     except Exception:
         log.exception("Unable to delete a subset of files. Continuing.")
 
+    # Restore the old index name.
+    if index_name:
+        document._index._name = old_index_name
+
 
 def _get_index(indices, index_name):
     """
@@ -95,38 +101,6 @@
     return document
 
 
-def _indexing_helper(html_objs_qs, wipe=False):
-    """
-    Helper function for reindexing and wiping indexes of projects and versions.
-
-    If ``wipe`` is set to False, html_objs are deleted from the ES index,
-    else, html_objs are indexed.
- """ - from readthedocs.search.documents import PageDocument - from readthedocs.search.tasks import delete_objects_in_es, index_objects_to_es - - if html_objs_qs: - obj_ids = [] - for html_objs in html_objs_qs: - obj_ids.extend([obj.id for obj in html_objs]) - - # removing redundant ids if exists. - obj_ids = list(set(obj_ids)) - - if obj_ids: - kwargs = { - "app_label": HTMLFile._meta.app_label, - "model_name": HTMLFile.__name__, - "document_class": str(PageDocument), - "objects_id": obj_ids, - } - - if not wipe: - index_objects_to_es.delay(**kwargs) - else: - delete_objects_in_es.delay(**kwargs) - - def _last_30_days_iter(): """Returns iterator for previous 30 days (including today).""" thirty_days_ago = timezone.now().date() - timezone.timedelta(days=30)