Skip to content

Commit

Permalink
Merge pull request #476 from NASA-IMPACT/candidate-url-model-modeific…
Browse files Browse the repository at this point in the history
…ation

Health check on urls and titles from test and production servers
  • Loading branch information
CarsonDavis authored Oct 25, 2023
2 parents c93dba3 + 4091022 commit f104896
Show file tree
Hide file tree
Showing 9 changed files with 1,486 additions and 4 deletions.
6 changes: 3 additions & 3 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ DJANGO_AWS_STORAGE_BUCKET_NAME=''

# GitHub (please create a new file called .env and put these in there)
# ------------------------------------------------------------------------------
# GITHUB_ACCESS_TOKEN=''
# SINEQUA_CONFIGS_GITHUB_REPO=''
# GITHUB_BRANCH_FOR_WEBAPP=''
GITHUB_ACCESS_TOKEN=''
SINEQUA_CONFIGS_GITHUB_REPO=''
GITHUB_BRANCH_FOR_WEBAPP=''
7 changes: 6 additions & 1 deletion config_generation/minimum_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,24 @@ def _process_response(self, response) -> dict[str, Any]:

return meaningful_response

def query(self, term: str):
def query(self, term: str, page: int = 1, collection_config_folder=None) -> dict[str, Any]:
    """Run a text search through the server's ``/api/v1/search.query`` endpoint.

    Args:
        term: Text to search for.
        page: Result page to request (up to 1000 rows per page). Defaults to
            1 so that calls written as ``query(term)`` keep working.
        collection_config_folder: Optional config folder name. When truthy,
            the search is restricted to ``/SMD/<collection_config_folder>/``.

    Returns:
        The value produced by ``self._process_response`` for the HTTP
        response.
    """
    url = f"{self.base_url}/api/v1/search.query"
    payload = {
        "app": self.app_name,
        "query": {
            "name": self.query_name,
            "action": "search",
            "text": term,
            "page": page,
            "pageSize": 1000,
            "tab": "all",
        },
        "pretty": "true",
    }

    # Only narrow the search when a folder was supplied; otherwise the
    # "collection" key is left out of the query entirely.
    if collection_config_folder:
        payload["query"]["collection"] = f"/SMD/{collection_config_folder}/"

    # NOTE(review): verify=False turns off TLS certificate verification for
    # this request - confirm that is intentional for the target servers.
    response = requests.post(url, json=payload, verify=False)

    return self._process_response(response)
Expand Down
1,290 changes: 1,290 additions & 0 deletions jupyter_notebooks/health_check_on_urls_titles.ipynb

Large diffs are not rendered by default.

76 changes: 76 additions & 0 deletions scripts/health_checks_on_urls_titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection
from sde_collections.sinequa_api import Api


def _collect_server_titles(api, collection_config_folder, url_field) -> dict:
    """Page through the server's records for one config folder.

    Returns a dict mapping each record's ``url_field`` value to its
    ``"title"``. Records missing either value are ignored.
    """
    titles_by_url = {}
    page = 1
    while True:
        # NOTE(review): no search term is passed here - confirm that
        # ``Api.query`` accepts a call without one.
        response = api.query(
            page=page, collection_config_folder=collection_config_folder
        )
        # A missing or zero 'cursorRowCount' is treated as "no more rows".
        if response.get("cursorRowCount", 0) == 0:
            break
        for record in response.get("records", []):
            url = record.get(url_field)
            title = record.get("title")
            if url and title:  # Ensure both url and title are present
                titles_by_url[url] = title
        page += 1
    return titles_by_url


def _health_check_on_urls_titles(server_name: str):
    """Sync presence flags and titles of candidate URLs with a server.

    For every ``Collection``, the server is queried page by page for the
    records under that collection's config folder. Each ``CandidateURL`` of
    the collection then gets its presence flag set to whether its URL was
    returned, and its title field set to the server's title, or to
    ``"Unavailable"`` when the URL was not returned. A candidate URL is
    saved only when one of those two fields actually changed.

    Args:
        server_name: ``"test"`` or ``"production"``; selects which record
            key holds the URL and which model fields are updated.

    Raises:
        ValueError: If ``server_name`` is neither ``"test"`` nor
            ``"production"``.
    """
    if server_name == "test":
        url_field = "download_url"
        status_field = "present_on_test"
        title_field = "test_title"
    elif server_name == "production":
        url_field = "url1"
        status_field = "present_on_prod"
        title_field = "production_title"
    else:
        # Handle invalid server name
        raise ValueError(f"Invalid server name: {server_name}")

    api = Api(server_name=server_name)

    # Iterate the Collection rows directly rather than re-querying each one
    # by its config folder afterwards.
    for collection in Collection.objects.all():
        collection_config_folder = collection.config_folder
        titles_by_url = _collect_server_titles(
            api, collection_config_folder, url_field
        )
        print(
            f"Finished collecting URLs from {server_name} server for config folder {collection_config_folder}"
        )

        for candidate_url in CandidateURL.objects.filter(collection=collection):
            is_present_on_server = candidate_url.url in titles_by_url
            server_title = titles_by_url.get(candidate_url.url, "Unavailable")

            changed = False
            if getattr(candidate_url, status_field) != is_present_on_server:
                setattr(candidate_url, status_field, is_present_on_server)
                changed = True
            # The title is refreshed independently of the presence flag so a
            # retitled page is picked up even if its presence is unchanged.
            if getattr(candidate_url, title_field) != server_title:
                setattr(candidate_url, title_field, server_title)
                changed = True
            if changed:
                candidate_url.save()
        print(
            f"Finished updating urls within collection config folder {collection_config_folder}"
        )


if __name__ == "__main__":
    # Run the check against the test server first, then production.
    for target_server in ("test", "production"):
        _health_check_on_urls_titles(server_name=target_server)
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Generated by Django 4.2.6 on 2023-10-25 14:50

from django.db import migrations, models


def _presence_flag(verbose_name, help_text):
    """Build a ``BooleanField`` that defaults to ``False``."""
    return models.BooleanField(
        default=False,
        help_text=help_text,
        verbose_name=verbose_name,
    )


def _server_title(verbose_name, help_text):
    """Build a blank-able ``CharField`` that defaults to the empty string."""
    return models.CharField(
        blank=True,
        default="",
        help_text=help_text,
        verbose_name=verbose_name,
    )


class Migration(migrations.Migration):
    """Add presence flags and server titles to the ``candidateurl`` model."""

    dependencies = [
        ("sde_collections", "0035_alter_candidateurl_unique_together"),
    ]

    operations = [
        migrations.AddField(
            model_name="candidateurl",
            name="present_on_prod",
            field=_presence_flag(
                "URL Present In Production?",
                "Helps keep track if the Current URL is present in production or not",
            ),
        ),
        migrations.AddField(
            model_name="candidateurl",
            name="present_on_test",
            field=_presence_flag(
                "URL Present In Test Environment?",
                "Helps keep track if the Current URL is present in test environment or not",
            ),
        ),
        migrations.AddField(
            model_name="candidateurl",
            name="production_title",
            field=_server_title(
                "Title on Production Server",
                "This is the title present on Production Server",
            ),
        ),
        migrations.AddField(
            model_name="candidateurl",
            name="test_title",
            field=_server_title(
                "Title on Test Server",
                "This is the title present on Test Server",
            ),
        ),
    ]
22 changes: 22 additions & 0 deletions sde_collections/models/candidate_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,18 @@ class CandidateURL(models.Model):
blank=True,
help_text="This is the title generated based on a Title Pattern",
)
test_title = models.CharField(
"Title on Test Server",
default="",
blank=True,
help_text="This is the title present on Test Server",
)
production_title = models.CharField(
"Title on Production Server",
default="",
blank=True,
help_text="This is the title present on Production Server",
)
level = models.IntegerField(
"Level", default=0, blank=True, help_text="Level in the tree. Based on /."
)
Expand All @@ -59,6 +71,16 @@ class CandidateURL(models.Model):
default=False,
help_text="This keeps track of whether the given url is pdf or not",
)
present_on_test = models.BooleanField(
"URL Present In Test Environment?",
default=False,
help_text="Helps keep track if the Current URL is present in test environment or not",
)
present_on_prod = models.BooleanField(
"URL Present In Production?",
default=False,
help_text="Helps keep track if the Current URL is present in production or not",
)

class Meta:
"""Meta definition for Candidate URL."""
Expand Down
4 changes: 4 additions & 0 deletions sde_collections/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ class Meta:
"document_type",
"document_type_display",
"visited",
"test_title",
"production_title",
"present_on_test",
"present_on_prod",
)


Expand Down
31 changes: 31 additions & 0 deletions sde_indexing_helper/static/js/candidate_url_list.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ function initializeDataTable() {
"columns": [
getURLColumn(),
getExcludedColumn(true_icon, false_icon),
getIsPresentOnTestColumn(true_icon, false_icon),
getIsPresentInProductionColumn(true_icon, false_icon),
getScrapedTitleColumn(),
getGeneratedTitleColumn(),
getTestTitleColumn(),
getProdTitleColumn(),
getVisitedColumn(true_icon, false_icon),
getDocumentTypeColumn(),
{ "data": "id", "visible": false, "searchable": false },
Expand Down Expand Up @@ -161,6 +165,17 @@ function getGeneratedTitleColumn() {
}
}

/**
 * Column definition bound to each row's `test_title` field.
 *
 * @returns {{data: string}} Column options object.
 */
function getTestTitleColumn() {
    var column = {};
    column["data"] = "test_title";
    return column;
}

/**
 * Column definition bound to each row's `production_title` field.
 *
 * @returns {{data: string}} Column options object.
 */
function getProdTitleColumn() {
    var column = {};
    column["data"] = "production_title";
    return column;
}

function getExcludedColumn(true_icon, false_icon) {
return {
Expand All @@ -180,6 +195,22 @@ function getVisitedColumn(true_icon, false_icon) {
}
}

/**
 * Column definition for the `present_on_test` flag.
 *
 * @param {*} true_icon  Value rendered when the cell data is strictly `true`.
 * @param {*} false_icon Value rendered for any other cell data.
 * @returns {Object} Column options with `data`, `class` and `render`.
 */
function getIsPresentOnTestColumn(true_icon, false_icon){
    // Only the boolean `true` selects true_icon; truthy non-booleans do not.
    function renderPresence(data, type, row) {
        if (data === true) {
            return true_icon;
        }
        return false_icon;
    }

    return {
        "data": "present_on_test",
        "class": "col-1 text-center",
        "render": renderPresence
    };
}

/**
 * Column definition for the `present_on_prod` flag.
 *
 * @param {*} true_icon  Value rendered when the cell data is strictly `true`.
 * @param {*} false_icon Value rendered for any other cell data.
 * @returns {Object} Column options with `data`, `class` and `render`.
 */
function getIsPresentInProductionColumn(true_icon, false_icon){
    // Only the boolean `true` selects true_icon; truthy non-booleans do not.
    function renderPresence(data, type, row) {
        if (data === true) {
            return true_icon;
        }
        return false_icon;
    }

    return {
        "data": "present_on_prod",
        "class": "col-1 text-center",
        "render": renderPresence
    };
}

function getDocumentTypeColumn() {
return {
"data": "document_type", "render": function (data, type, row) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,12 @@ <h3>
<tr>
<th scope="col" class="text-center col-1">URL</th>
<th scope="col" class="text-center col-1">Exclude</th>
<th scope="col" class="text-center col-1">URL on Test Server?</th>
<th scope="col" class="text-center col-1">URL in Production?</th>
<th scope="col" class="text-center col-1">Scraped Title</th>
<th scope="col" class="text-center col-1">New Title</th>
<th scope="col" class="text-center col-1">Title on Test Server</th>
<th scope="col" class="text-center col-1">Title in Production</th>
<th scope="col" class="text-center col-1">Visited?</th>
<th scope="col" class="text-center col-1">Document Type</th>
<th scope="col" class="text-center col-1">ID</th>
Expand Down

0 comments on commit f104896

Please sign in to comment.